From c03ac54fbf82cc7458a0a6594b655c1dd86d4e37 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 16 Aug 2018 16:34:05 +0100 Subject: [PATCH 1/2] [ML] Remove old per-partition normalization code Per-partition normalization is an old, undocumented feature that was never used by clients. It has been superseded by per-partition maximum scoring (see #32748). This PR removes the now redundant code. Relates https://github.com/elastic/elasticsearch/pull/32816 --- bin/autodetect/CCmdLineParser.cc | 6 -- bin/autodetect/CCmdLineParser.h | 1 - bin/autodetect/Main.cc | 9 +-- bin/normalize/CCmdLineParser.cc | 8 +- bin/normalize/CCmdLineParser.h | 3 +- bin/normalize/Main.cc | 9 +-- include/api/CHierarchicalResultsWriter.h | 10 +-- include/api/CJsonOutputWriter.h | 7 -- include/api/CResultNormalizer.h | 9 --- include/model/CAnomalyDetectorModelConfig.h | 9 --- lib/api/CHierarchicalResultsWriter.cc | 29 -------- lib/api/CJsonOutputWriter.cc | 47 ------------ lib/api/CResultNormalizer.cc | 35 +-------- lib/api/unittest/CJsonOutputWriterTest.cc | 74 ------------------- lib/api/unittest/CJsonOutputWriterTest.h | 1 - lib/model/CAnomalyDetectorModelConfig.cc | 14 +--- lib/model/CHierarchicalResultsNormalizer.cc | 3 +- .../CAnomalyDetectorModelConfigTest.cc | 2 - 18 files changed, 15 insertions(+), 261 deletions(-) diff --git a/bin/autodetect/CCmdLineParser.cc b/bin/autodetect/CCmdLineParser.cc index 2f48cbbeb7..5fa131cffe 100644 --- a/bin/autodetect/CCmdLineParser.cc +++ b/bin/autodetect/CCmdLineParser.cc @@ -53,7 +53,6 @@ bool CCmdLineParser::parse(int argc, std::size_t& bucketResultsDelay, bool& multivariateByFields, std::string& multipleBucketspans, - bool& perPartitionNormalization, TStrVec& clauseTokens) { try { boost::program_options::options_description desc(DESCRIPTION); @@ -119,8 +118,6 @@ bool CCmdLineParser::parse(int argc, "Optional flag to enable multi-variate analysis of correlated by fields") ("multipleBucketspans", boost::program_options::value(), "Optional comma-separated list of additional bucketspans - must be direct multiples of the main bucketspan") - ("perPartitionNormalization", - "Optional flag to enable per partition normalization") ; // clang-format on @@ -237,9 +234,6 @@ bool CCmdLineParser::parse(int argc, if (vm.count("multipleBucketspans") > 0) { multipleBucketspans = vm["multipleBucketspans"].as(); } - if (vm.count("perPartitionNormalization") > 0) { - perPartitionNormalization = true; - } boost::program_options::collect_unrecognized( parsed.options, boost::program_options::include_positional) diff --git a/bin/autodetect/CCmdLineParser.h b/bin/autodetect/CCmdLineParser.h index cb80c82075..ed8ec94dfd 100644 --- a/bin/autodetect/CCmdLineParser.h +++ b/bin/autodetect/CCmdLineParser.h @@ -65,7 +65,6 @@ class CCmdLineParser { std::size_t& bucketResultsDelay, bool& multivariateByFields, std::string& multipleBucketspans, - bool& perPartitionNormalization, TStrVec& clauseTokens); private: diff --git a/bin/autodetect/Main.cc b/bin/autodetect/Main.cc index dcf905e0a6..930399fcdf 100644 --- a/bin/autodetect/Main.cc +++ b/bin/autodetect/Main.cc @@ -89,7 +89,6 @@ int main(int argc, char** argv) { std::size_t bucketResultsDelay(0); bool multivariateByFields(false); std::string multipleBucketspans; - bool perPartitionNormalization(false); TStrVec clauseTokens; if (ml::autodetect::CCmdLineParser::parse( argc, argv, limitConfigFile, modelConfigFile, fieldConfigFile, @@ -97,10 +96,9 @@ int main(int argc, char** argv) { summaryCountFieldName, delimiter, lengthEncodedInput, timeField, timeFormat, 
quantilesStateFile, deleteStateFiles, persistInterval, maxQuantileInterval, inputFileName, isInputFileNamedPipe, outputFileName, - isOutputFileNamedPipe, restoreFileName, isRestoreFileNamedPipe, - persistFileName, isPersistFileNamedPipe, maxAnomalyRecords, memoryUsage, - bucketResultsDelay, multivariateByFields, multipleBucketspans, - perPartitionNormalization, clauseTokens) == false) { + isOutputFileNamedPipe, restoreFileName, isRestoreFileNamedPipe, persistFileName, + isPersistFileNamedPipe, maxAnomalyRecords, memoryUsage, bucketResultsDelay, + multivariateByFields, multipleBucketspans, clauseTokens) == false) { return EXIT_FAILURE; } @@ -148,7 +146,6 @@ int main(int argc, char** argv) { ml::model::CAnomalyDetectorModelConfig::defaultConfig( bucketSpan, summaryMode, summaryCountFieldName, latency, bucketResultsDelay, multivariateByFields, multipleBucketspans); - modelConfig.perPartitionNormalization(perPartitionNormalization); modelConfig.detectionRules(ml::model::CAnomalyDetectorModelConfig::TIntDetectionRuleVecUMapCRef( fieldConfig.detectionRules())); modelConfig.scheduledEvents(ml::model::CAnomalyDetectorModelConfig::TStrDetectionRulePrVecCRef( diff --git a/bin/normalize/CCmdLineParser.cc b/bin/normalize/CCmdLineParser.cc index 4bb00e6ea7..30162d6bdb 100644 --- a/bin/normalize/CCmdLineParser.cc +++ b/bin/normalize/CCmdLineParser.cc @@ -30,8 +30,7 @@ bool CCmdLineParser::parse(int argc, bool& isOutputFileNamedPipe, std::string& quantilesState, bool& deleteStateFiles, - bool& writeCsv, - bool& perPartitionNormalization) { + bool& writeCsv) { try { boost::program_options::options_description desc(DESCRIPTION); // clang-format off @@ -60,8 +59,6 @@ bool CCmdLineParser::parse(int argc, "If this flag is set then delete the normalizer state files once they have been read") ("writeCsv", "Write the results in CSV format (default is lineified JSON)") - ("perPartitionNormalization", - "Optional flag to enable per partition normalization") ; // clang-format on @@ -114,9 +111,6 @@ bool CCmdLineParser::parse(int argc, if (vm.count("writeCsv") > 0) { writeCsv = true; } - if (vm.count("perPartitionNormalization") > 0) { - perPartitionNormalization = true; - } } catch (std::exception& e) { std::cerr << "Error processing command line: " << e.what() << std::endl; return false; diff --git a/bin/normalize/CCmdLineParser.h b/bin/normalize/CCmdLineParser.h index 4268165918..b67c98fc2a 100644 --- a/bin/normalize/CCmdLineParser.h +++ b/bin/normalize/CCmdLineParser.h @@ -43,8 +43,7 @@ class CCmdLineParser { bool& isOutputFileNamedPipe, std::string& quantilesState, bool& deleteStateFiles, - bool& writeCsv, - bool& perPartitionNormalization); + bool& writeCsv); private: static const std::string DESCRIPTION; diff --git a/bin/normalize/Main.cc b/bin/normalize/Main.cc index b2a9ed359e..3f6ee0cb71 100644 --- a/bin/normalize/Main.cc +++ b/bin/normalize/Main.cc @@ -54,12 +54,10 @@ int main(int argc, char** argv) { std::string quantilesStateFile; bool deleteStateFiles(false); bool writeCsv(false); - bool perPartitionNormalization(false); if (ml::normalize::CCmdLineParser::parse( - argc, argv, modelConfigFile, logProperties, logPipe, bucketSpan, - lengthEncodedInput, inputFileName, isInputFileNamedPipe, - outputFileName, isOutputFileNamedPipe, quantilesStateFile, - deleteStateFiles, writeCsv, perPartitionNormalization) == false) { + argc, argv, modelConfigFile, logProperties, logPipe, bucketSpan, lengthEncodedInput, + inputFileName, isInputFileNamedPipe, outputFileName, isOutputFileNamedPipe, + quantilesStateFile, 
deleteStateFiles, writeCsv) == false) { return EXIT_FAILURE; } @@ -93,7 +91,6 @@ int main(int argc, char** argv) { LOG_FATAL(<< "Ml model config file '" << modelConfigFile << "' could not be loaded"); return EXIT_FAILURE; } - modelConfig.perPartitionNormalization(perPartitionNormalization); // There's a choice of input and output formats for the numbers to be normalised using TInputParserUPtr = std::unique_ptr; diff --git a/include/api/CHierarchicalResultsWriter.h b/include/api/CHierarchicalResultsWriter.h index 18872150df..8e88aeaa87 100644 --- a/include/api/CHierarchicalResultsWriter.h +++ b/include/api/CHierarchicalResultsWriter.h @@ -51,12 +51,7 @@ class API_EXPORT CHierarchicalResultsWriter : public model::CHierarchicalResults using TStr1Vec = core::CSmallVector; public: - enum EResultType { - E_SimpleCountResult, - E_PopulationResult, - E_PartitionResult, - E_Result - }; + enum EResultType { E_SimpleCountResult, E_PopulationResult, E_Result }; //! Type which wraps up the results of anomaly detection. struct API_EXPORT SResults { //! Construct for population results @@ -168,9 +163,6 @@ class API_EXPORT CHierarchicalResultsWriter : public model::CHierarchicalResults //! pivot. void writePivotResult(const model::CHierarchicalResults& results, const TNode& node); - //! Write partition result if \p node is a partition level result - void writePartitionResult(const model::CHierarchicalResults& results, const TNode& node); - //! Write out a simple count result if \p node is simple //! count. void writeSimpleCountResult(const TNode& node); diff --git a/include/api/CJsonOutputWriter.h b/include/api/CJsonOutputWriter.h index f83e24ca72..5a4db20daa 100644 --- a/include/api/CJsonOutputWriter.h +++ b/include/api/CJsonOutputWriter.h @@ -162,9 +162,6 @@ class API_EXPORT CJsonOutputWriter : public COutputHandler { // when the number to write is limited double s_LowestBucketInfluencerScore; - //! Partition scores - TDocumentWeakPtrVec s_PartitionScoreDocuments; - //! scheduled event descriptions TStr1Vec s_ScheduledEventDescriptions; }; @@ -304,10 +301,6 @@ class API_EXPORT CJsonOutputWriter : public COutputHandler { void addInfluences(const CHierarchicalResultsWriter::TStoredStringPtrStoredStringPtrPrDoublePrVec& influenceResults, TDocumentWeakPtr weakDoc); - //! Write partition score & probability - void addPartitionScores(const CHierarchicalResultsWriter::TResults& results, - TDocumentWeakPtr weakDoc); - private: //! The job ID std::string m_JobId; diff --git a/include/api/CResultNormalizer.h b/include/api/CResultNormalizer.h index 91296d3a07..d439d7b3d2 100644 --- a/include/api/CResultNormalizer.h +++ b/include/api/CResultNormalizer.h @@ -93,15 +93,6 @@ class API_EXPORT CResultNormalizer { std::string& valueFieldName, double& probability); - bool parseDataFields(const TStrStrUMap& dataRowFields, - std::string& level, - std::string& partition, - std::string& partitionValue, - std::string& person, - std::string& function, - std::string& valueFieldName, - double& probability); - template bool parseDataField(const TStrStrUMap& dataRowFields, const std::string& fieldName, diff --git a/include/model/CAnomalyDetectorModelConfig.h b/include/model/CAnomalyDetectorModelConfig.h index d6764b5561..035ba5baf2 100644 --- a/include/model/CAnomalyDetectorModelConfig.h +++ b/include/model/CAnomalyDetectorModelConfig.h @@ -424,12 +424,6 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig { const TDoubleDoublePrVec& normalizedScoreKnotPoints() const; //@} - //! 
Check if we should create one normalizer per partition field value. - bool perPartitionNormalization() const; - - //! Set whether we should create one normalizer per partition field value. - void perPartitionNormalization(bool value); - //! Sets the reference to the detection rules map void detectionRules(TIntDetectionRuleVecUMapCRef detectionRules); @@ -500,9 +494,6 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig { //! and the normalized anomaly score with these knot points. //! \see DEFAULT_NORMALIZED_SCORE_KNOT_POINTS for details. TDoubleDoublePrVec m_NormalizedScoreKnotPoints; - - //! If true then create one normalizer per partition field value. - bool m_PerPartitionNormalisation; //@} //! A reference to the map containing detection rules per diff --git a/lib/api/CHierarchicalResultsWriter.cc b/lib/api/CHierarchicalResultsWriter.cc index 3daafb4cd0..5a844e9aea 100644 --- a/lib/api/CHierarchicalResultsWriter.cc +++ b/lib/api/CHierarchicalResultsWriter.cc @@ -126,7 +126,6 @@ void CHierarchicalResultsWriter::visit(const model::CHierarchicalResults& result } else { this->writePopulationResult(results, node); this->writeIndividualResult(results, node); - this->writePartitionResult(results, node); this->writeSimpleCountResult(node); } } @@ -258,34 +257,6 @@ void CHierarchicalResultsWriter::writeIndividualResult(const model::CHierarchica node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST)); } -void CHierarchicalResultsWriter::writePartitionResult(const model::CHierarchicalResults& results, - const TNode& node) { - if (!m_ModelConfig.perPartitionNormalization() || this->isSimpleCount(node) || - this->isPopulation(node) || !this->isPartition(node) || - !this->shouldWriteResult(m_Limits, results, node, false)) { - return; - } - - model_t::EFeature feature = - node.s_AnnotatedProbability.s_AttributeProbabilities.empty() - ? 
model_t::E_IndividualCountByBucketAndPerson - : node.s_AnnotatedProbability.s_AttributeProbabilities[0].s_Feature; - - TDouble1Vec emptyDoubleVec; - - m_ResultWriterFunc(TResults( - E_PartitionResult, *node.s_Spec.s_PartitionFieldName, - *node.s_Spec.s_PartitionFieldValue, *node.s_Spec.s_ByFieldName, - *node.s_Spec.s_PersonFieldValue, EMPTY_STRING, node.s_BucketStartTime, - *node.s_Spec.s_FunctionName, model_t::outputFunctionName(feature), - node.s_AnnotatedProbability.s_BaselineBucketCount, - node.s_AnnotatedProbability.s_CurrentBucketCount, emptyDoubleVec, emptyDoubleVec, - node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore, node.probability(), - *node.s_Spec.s_ValueFieldName, node.s_AnnotatedProbability.s_Influences, - node.s_Spec.s_UseNull, model::function_t::isMetric(node.s_Spec.s_Function), - node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST)); -} - void CHierarchicalResultsWriter::writePivotResult(const model::CHierarchicalResults& results, const TNode& node) { if (this->isSimpleCount(node) || diff --git a/lib/api/CJsonOutputWriter.cc b/lib/api/CJsonOutputWriter.cc index 0acdd8e92c..a075f2639e 100644 --- a/lib/api/CJsonOutputWriter.cc +++ b/lib/api/CJsonOutputWriter.cc @@ -70,7 +70,6 @@ const std::string EXAMPLES("examples"); const std::string BUCKET_SPAN("bucket_span"); const std::string PROCESSING_TIME("processing_time_ms"); const std::string TIME_INFLUENCER("bucket_time"); -const std::string PARTITION_SCORES("partition_scores"); const std::string SCHEDULED_EVENTS("scheduled_events"); const std::string QUANTILES("quantiles"); @@ -191,14 +190,6 @@ bool CJsonOutputWriter::acceptResult(const CHierarchicalResultsWriter::TResults& return true; } - if (results.s_ResultType == CHierarchicalResultsWriter::E_PartitionResult) { - TDocumentWeakPtr partitionDoc = m_Writer.makeStorableDoc(); - this->addPartitionScores(results, partitionDoc); - bucketData.s_PartitionScoreDocuments.push_back(partitionDoc); - - return true; - } - ++bucketData.s_RecordCount; TDocumentWeakPtrIntPrVec& detectorDocumentsToWrite = bucketData.s_DocumentsToWrite; @@ -513,26 +504,6 @@ void CJsonOutputWriter::writeBucket(bool isInterim, m_Writer.EndArray(); } - if (!bucketData.s_PartitionScoreDocuments.empty()) { - // Write the array of partition-anonaly score pairs - m_Writer.String(PARTITION_SCORES); - m_Writer.StartArray(); - for (TDocumentWeakPtrVecItr partitionScoresIter = - bucketData.s_PartitionScoreDocuments.begin(); - partitionScoresIter != bucketData.s_PartitionScoreDocuments.end(); - ++partitionScoresIter) { - TDocumentWeakPtr weakDoc = *partitionScoresIter; - TDocumentPtr docPtr = weakDoc.lock(); - if (!docPtr) { - LOG_ERROR(<< "Inconsistent program state. JSON document unavailable."); - continue; - } - - m_Writer.write(*docPtr); - } - m_Writer.EndArray(); - } - m_Writer.String(PROCESSING_TIME); m_Writer.Uint64(bucketProcessingTime); @@ -816,24 +787,6 @@ void CJsonOutputWriter::addInfluencerFields(bool isBucketInfluencer, } } -void CJsonOutputWriter::addPartitionScores(const CHierarchicalResultsWriter::TResults& results, - TDocumentWeakPtr weakDoc) { - TDocumentPtr docPtr = weakDoc.lock(); - if (!docPtr) { - LOG_ERROR(<< "Inconsistent program state. 
JSON document unavailable."); - return; - } - - m_Writer.addDoubleFieldToObj(PROBABILITY, results.s_Probability, *docPtr); - m_Writer.addStringFieldCopyToObj(PARTITION_FIELD_NAME, - results.s_PartitionFieldName, *docPtr); - m_Writer.addStringFieldCopyToObj(PARTITION_FIELD_VALUE, - results.s_PartitionFieldValue, *docPtr, true); - m_Writer.addDoubleFieldToObj(INITIAL_RECORD_SCORE, - results.s_NormalizedAnomalyScore, *docPtr); - m_Writer.addDoubleFieldToObj(RECORD_SCORE, results.s_NormalizedAnomalyScore, *docPtr); -} - void CJsonOutputWriter::limitNumberRecords(size_t count) { m_RecordOutputLimit = count; } diff --git a/lib/api/CResultNormalizer.cc b/lib/api/CResultNormalizer.cc index a20f183c6d..1545e86e86 100644 --- a/lib/api/CResultNormalizer.cc +++ b/lib/api/CResultNormalizer.cc @@ -75,18 +75,8 @@ bool CResultNormalizer::handleRecord(const TStrStrUMap& dataRowFields) { std::string valueFieldName; double probability(0.0); - bool isValidRecord(false); - if (m_ModelConfig.perPartitionNormalization()) { - isValidRecord = parseDataFields(dataRowFields, level, partition, partitionValue, - person, function, valueFieldName, probability); - } else { - isValidRecord = parseDataFields(dataRowFields, level, partition, person, - function, valueFieldName, probability); - } - - std::string partitionKey = m_ModelConfig.perPartitionNormalization() - ? partition + partitionValue - : partition; + bool isValidRecord = parseDataFields(dataRowFields, level, partition, person, + function, valueFieldName, probability); if (isValidRecord) { const model::CAnomalyScore::CNormalizer* levelNormalizer = nullptr; @@ -96,10 +86,10 @@ bool CResultNormalizer::handleRecord(const TStrStrUMap& dataRowFields) { if (level == ROOT_LEVEL) { levelNormalizer = &m_Normalizer.bucketNormalizer(); } else if (level == LEAF_LEVEL) { - levelNormalizer = m_Normalizer.leafNormalizer(partitionKey, person, + levelNormalizer = m_Normalizer.leafNormalizer(partition, person, function, valueFieldName); } else if (level == PARTITION_LEVEL) { - levelNormalizer = m_Normalizer.partitionNormalizer(partitionKey); + levelNormalizer = m_Normalizer.partitionNormalizer(partition); } else if (level == BUCKET_INFLUENCER_LEVEL) { levelNormalizer = m_Normalizer.influencerBucketNormalizer(person); } else if (level == INFLUENCER_LEVEL) { @@ -148,22 +138,5 @@ bool CResultNormalizer::parseDataFields(const TStrStrUMap& dataRowFields, this->parseDataField(dataRowFields, VALUE_FIELD_NAME, valueFieldName) && this->parseDataField(dataRowFields, PROBABILITY_NAME, probability); } - -bool CResultNormalizer::parseDataFields(const TStrStrUMap& dataRowFields, - std::string& level, - std::string& partition, - std::string& partitionValue, - std::string& person, - std::string& function, - std::string& valueFieldName, - double& probability) { - return this->parseDataField(dataRowFields, LEVEL, level) && - this->parseDataField(dataRowFields, PARTITION_FIELD_NAME, partition) && - this->parseDataField(dataRowFields, PARTITION_FIELD_VALUE, partitionValue) && - this->parseDataField(dataRowFields, PERSON_FIELD_NAME, person) && - this->parseDataField(dataRowFields, FUNCTION_NAME, function) && - this->parseDataField(dataRowFields, VALUE_FIELD_NAME, valueFieldName) && - this->parseDataField(dataRowFields, PROBABILITY_NAME, probability); -} } } diff --git a/lib/api/unittest/CJsonOutputWriterTest.cc b/lib/api/unittest/CJsonOutputWriterTest.cc index 2dc3d67664..6dc129fb20 100644 --- a/lib/api/unittest/CJsonOutputWriterTest.cc +++ b/lib/api/unittest/CJsonOutputWriterTest.cc @@ -71,8 +71,6 
@@ CppUnit::Test* CJsonOutputWriterTest::suite() { suiteOfTests->addTest(new CppUnit::TestCaller( "CJsonOutputWriterTest::testPersistNormalizer", &CJsonOutputWriterTest::testPersistNormalizer)); - suiteOfTests->addTest(new CppUnit::TestCaller( - "CJsonOutputWriterTest::testPartitionScores", &CJsonOutputWriterTest::testPartitionScores)); suiteOfTests->addTest(new CppUnit::TestCaller( "CJsonOutputWriterTest::testReportMemoryUsage", &CJsonOutputWriterTest::testReportMemoryUsage)); @@ -1548,78 +1546,6 @@ void CJsonOutputWriterTest::testPersistNormalizer() { CPPUNIT_ASSERT(quantileState.HasMember("timestamp")); } -void CJsonOutputWriterTest::testPartitionScores() { - ml::model::CAnomalyDetectorModelConfig modelConfig = - ml::model::CAnomalyDetectorModelConfig::defaultConfig(); - - std::ostringstream sstream; - { - ml::core::CJsonOutputStreamWrapper outputStream(sstream); - ml::api::CJsonOutputWriter writer("job", outputStream); - - std::string emptyString; - ml::api::CHierarchicalResultsWriter::TStoredStringPtrStoredStringPtrPrDoublePrVec influences; - - std::string partitionFieldName("part1"); - - for (int i = 0; i < 4; ++i) { - // For the first iteration use an empty string for the value - std::string partitionFieldValue; - if (i > 0) { - partitionFieldValue = 'p' + ml::core::CStringUtils::typeToString(i); - } - ml::api::CHierarchicalResultsWriter::SResults result( - ml::api::CHierarchicalResultsWriter::E_PartitionResult, - partitionFieldName, partitionFieldValue, emptyString, - emptyString, emptyString, 1, emptyString, emptyString, 42.0, 79, - TDouble1Vec(1, 6953.0), TDouble1Vec(1, 10090.0), 0.0, - double(i), // normalised anomaly score - 0.1, emptyString, influences, false, true, 1, 100, EMPTY_STRING_LIST); - - writer.acceptResult(result); - } - - writer.endOutputBatch(false, 1ul); - } - - rapidjson::Document doc; - doc.Parse(sstream.str().c_str()); - - LOG_DEBUG(<< sstream.str()); - - const rapidjson::Value& bucketWrapper = doc[rapidjson::SizeType(0)]; - CPPUNIT_ASSERT(bucketWrapper.HasMember("bucket")); - const rapidjson::Value& bucket = bucketWrapper["bucket"]; - CPPUNIT_ASSERT(bucket.HasMember("partition_scores")); - const rapidjson::Value& partitionScores = bucket["partition_scores"]; - - CPPUNIT_ASSERT(partitionScores.IsArray()); - CPPUNIT_ASSERT_EQUAL(rapidjson::SizeType(4), partitionScores.Size()); - - for (rapidjson::SizeType i = 0; i < partitionScores.Size(); ++i) { - const rapidjson::Value& pDoc = partitionScores[i]; - CPPUNIT_ASSERT(pDoc.IsObject()); - CPPUNIT_ASSERT(pDoc.HasMember("probability")); - CPPUNIT_ASSERT_DOUBLES_EQUAL(0.1, pDoc["probability"].GetDouble(), 0.01); - CPPUNIT_ASSERT(pDoc.HasMember("record_score")); - CPPUNIT_ASSERT_DOUBLES_EQUAL(double(i), pDoc["record_score"].GetDouble(), 0.01); - CPPUNIT_ASSERT(pDoc.HasMember("initial_record_score")); - CPPUNIT_ASSERT_DOUBLES_EQUAL(double(i), - pDoc["initial_record_score"].GetDouble(), 0.01); - - CPPUNIT_ASSERT(pDoc.HasMember("partition_field_name")); - CPPUNIT_ASSERT_EQUAL(std::string("part1"), - std::string(pDoc["partition_field_name"].GetString())); - std::string fieldValue; - if (i > 0) { - fieldValue = 'p' + ml::core::CStringUtils::typeToString(i); - } - CPPUNIT_ASSERT(pDoc.HasMember("partition_field_value")); - CPPUNIT_ASSERT_EQUAL( - fieldValue, std::string(pDoc["partition_field_value"].GetString())); - } -} - void CJsonOutputWriterTest::testReportMemoryUsage() { std::ostringstream sstream; { diff --git a/lib/api/unittest/CJsonOutputWriterTest.h b/lib/api/unittest/CJsonOutputWriterTest.h index 
2d61a299ad..e4a513944e 100644 --- a/lib/api/unittest/CJsonOutputWriterTest.h +++ b/lib/api/unittest/CJsonOutputWriterTest.h @@ -22,7 +22,6 @@ class CJsonOutputWriterTest : public CppUnit::TestFixture { void testWriteInfluencers(); void testWriteInfluencersWithLimit(); void testPersistNormalizer(); - void testPartitionScores(); void testReportMemoryUsage(); void testWriteScheduledEvent(); void testThroughputWithScopedAllocator(); diff --git a/lib/model/CAnomalyDetectorModelConfig.cc b/lib/model/CAnomalyDetectorModelConfig.cc index c93fdf6c8c..d7b8b09499 100644 --- a/lib/model/CAnomalyDetectorModelConfig.cc +++ b/lib/model/CAnomalyDetectorModelConfig.cc @@ -173,8 +173,7 @@ CAnomalyDetectorModelConfig::CAnomalyDetectorModelConfig() m_NoiseMultiplier(DEFAULT_NOISE_MULTIPLIER), m_NormalizedScoreKnotPoints(boost::begin(DEFAULT_NORMALIZED_SCORE_KNOT_POINTS), boost::end(DEFAULT_NORMALIZED_SCORE_KNOT_POINTS)), - m_PerPartitionNormalisation(false), m_DetectionRules(EMPTY_RULES_MAP), - m_ScheduledEvents(EMPTY_EVENTS) { + m_DetectionRules(EMPTY_RULES_MAP), m_ScheduledEvents(EMPTY_EVENTS) { for (std::size_t i = 0u; i < model_t::NUMBER_AGGREGATION_STYLES; ++i) { for (std::size_t j = 0u; j < model_t::NUMBER_AGGREGATION_PARAMS; ++j) { m_AggregationStyleParams[i][j] = DEFAULT_AGGREGATION_STYLE_PARAMS[i][j]; @@ -732,14 +731,6 @@ CAnomalyDetectorModelConfig::normalizedScoreKnotPoints() const { return m_NormalizedScoreKnotPoints; } -bool CAnomalyDetectorModelConfig::perPartitionNormalization() const { - return m_PerPartitionNormalisation; -} - -void CAnomalyDetectorModelConfig::perPartitionNormalization(bool value) { - m_PerPartitionNormalisation = value; -} - void CAnomalyDetectorModelConfig::detectionRules(TIntDetectionRuleVecUMapCRef detectionRules) { m_DetectionRules = detectionRules; } @@ -769,7 +760,6 @@ const std::string MAXIMUM_ANOMALOUS_PROBABILITY_PROPERTY("maximumanomalousprobab const std::string NOISE_PERCENTILE_PROPERTY("noisepercentile"); const std::string NOISE_MULTIPLIER_PROPERTY("noisemultiplier"); const std::string NORMALIZED_SCORE_KNOT_POINTS("normalizedscoreknotpoints"); -const std::string PER_PARTITION_NORMALIZATION_PROPERTY("perPartitionNormalization"); } bool CAnomalyDetectorModelConfig::processStanza(const boost::property_tree::ptree& propertyTree) { @@ -1007,8 +997,6 @@ bool CAnomalyDetectorModelConfig::processStanza(const boost::property_tree::ptre } points.emplace_back(100.0, 100.0); this->normalizedScoreKnotPoints(points); - } else if (propName == PER_PARTITION_NORMALIZATION_PROPERTY) { - } else { LOG_WARN(<< "Ignoring unknown property " << propName); } diff --git a/lib/model/CHierarchicalResultsNormalizer.cc b/lib/model/CHierarchicalResultsNormalizer.cc index 13cffb0e25..4cf620baa6 100644 --- a/lib/model/CHierarchicalResultsNormalizer.cc +++ b/lib/model/CHierarchicalResultsNormalizer.cc @@ -110,8 +110,7 @@ void CHierarchicalResultsNormalizer::visit(const CHierarchicalResults& /*results bool pivot) { CNormalizerFactory factory(m_ModelConfig); TNormalizerPtrVec normalizers; - this->elements(node, pivot, factory, normalizers, - m_ModelConfig.perPartitionNormalization()); + this->elements(node, pivot, factory, normalizers); if (normalizers.empty()) { return; diff --git a/lib/model/unittest/CAnomalyDetectorModelConfigTest.cc b/lib/model/unittest/CAnomalyDetectorModelConfigTest.cc index 1534258147..e1de2a01a9 100644 --- a/lib/model/unittest/CAnomalyDetectorModelConfigTest.cc +++ b/lib/model/unittest/CAnomalyDetectorModelConfigTest.cc @@ -116,7 +116,6 @@ void 
CAnomalyDetectorModelConfigTest::testNormal() { CPPUNIT_ASSERT_EQUAL( std::string("[(0, 0), (70, 1.5), (85, 1.6), (90, 1.7), (95, 2), (97, 10), (98, 20), (99.5, 50), (100, 100)]"), core::CContainerPrinter::print(config.normalizedScoreKnotPoints())); - CPPUNIT_ASSERT_EQUAL(false, config.perPartitionNormalization()); } { CAnomalyDetectorModelConfig config = CAnomalyDetectorModelConfig::defaultConfig(); @@ -140,7 +139,6 @@ void CAnomalyDetectorModelConfigTest::testNormal() { config.factory(1, function_t::E_PopulationRare).get())); CPPUNIT_ASSERT(dynamic_cast( config.factory(CSearchKey::simpleCountKey()).get())); - CPPUNIT_ASSERT_EQUAL(false, config.perPartitionNormalization()); } } From 2bc03bdb4c3a102ef449587b88e8e74a6823c62e Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Fri, 17 Aug 2018 10:21:50 +0100 Subject: [PATCH 2/2] Further cleanup of perPartitionNormalization --- include/model/CHierarchicalResultsLevelSet.h | 31 ++++++-------- .../CHierarchicalResultsLevelSetTest.cc | 42 ++++--------------- .../CHierarchicalResultsLevelSetTest.h | 2 +- 3 files changed, 22 insertions(+), 53 deletions(-) diff --git a/include/model/CHierarchicalResultsLevelSet.h b/include/model/CHierarchicalResultsLevelSet.h index ea7b99f168..5b9ff72f50 100644 --- a/include/model/CHierarchicalResultsLevelSet.h +++ b/include/model/CHierarchicalResultsLevelSet.h @@ -162,11 +162,7 @@ class CHierarchicalResultsLevelSet : public CHierarchicalResultsVisitor { //! Get and possibly add a normalizer for \p node. template - void elements(const TNode& node, - bool pivot, - const FACTORY& factory, - TTypePtrVec& result, - bool distinctLeavesPerPartition = false) { + void elements(const TNode& node, bool pivot, const FACTORY& factory, TTypePtrVec& result) { result.clear(); if (this->isSimpleCount(node)) { return; } @@ -193,39 +189,38 @@ class CHierarchicalResultsLevelSet : public CHierarchicalResultsVisitor { return; } - std::string partitionKey = distinctLeavesPerPartition - ? 
*node.s_Spec.s_PartitionFieldName + - *node.s_Spec.s_PartitionFieldValue - : *node.s_Spec.s_PartitionFieldName; - if (this->isLeaf(node)) { - TWord word = ms_Dictionary.word(partitionKey, *node.s_Spec.s_PersonFieldName, - *node.s_Spec.s_FunctionName, - *node.s_Spec.s_ValueFieldName); + TWord word = ms_Dictionary.word( + *node.s_Spec.s_PartitionFieldName, *node.s_Spec.s_PersonFieldName, + *node.s_Spec.s_FunctionName, *node.s_Spec.s_ValueFieldName); TWordTypePrVecItr i = element(m_LeafSet, word); if (i == m_LeafSet.end() || i->first != word) { i = m_LeafSet.insert( - i, TWordTypePr(word, factory.make(partitionKey, *node.s_Spec.s_PersonFieldName, + i, TWordTypePr(word, factory.make(*node.s_Spec.s_PartitionFieldName, + *node.s_Spec.s_PersonFieldName, *node.s_Spec.s_FunctionName, *node.s_Spec.s_ValueFieldName))); } result.push_back(&i->second); } if (this->isPerson(node)) { - TWord word = ms_Dictionary.word(partitionKey, *node.s_Spec.s_PersonFieldName); + TWord word = ms_Dictionary.word(*node.s_Spec.s_PartitionFieldName, + *node.s_Spec.s_PersonFieldName); TWordTypePrVecItr i = element(m_PersonSet, word); if (i == m_PersonSet.end() || i->first != word) { i = m_PersonSet.insert( - i, TWordTypePr(word, factory.make(partitionKey, *node.s_Spec.s_PersonFieldName))); + i, TWordTypePr(word, factory.make(*node.s_Spec.s_PartitionFieldName, + *node.s_Spec.s_PersonFieldName))); } result.push_back(&i->second); } if (this->isPartition(node)) { - TWord word = ms_Dictionary.word(partitionKey); + TWord word = ms_Dictionary.word(*node.s_Spec.s_PartitionFieldName); TWordTypePrVecItr i = element(m_PartitionSet, word); if (i == m_PartitionSet.end() || i->first != word) { - i = m_PartitionSet.insert(i, TWordTypePr(word, factory.make(partitionKey))); + i = m_PartitionSet.insert( + i, TWordTypePr(word, factory.make(*node.s_Spec.s_PartitionFieldName))); } result.push_back(&i->second); } diff --git a/lib/model/unittest/CHierarchicalResultsLevelSetTest.cc b/lib/model/unittest/CHierarchicalResultsLevelSetTest.cc index 766089ff48..0478612433 100644 --- a/lib/model/unittest/CHierarchicalResultsLevelSetTest.cc +++ b/lib/model/unittest/CHierarchicalResultsLevelSetTest.cc @@ -17,8 +17,8 @@ CppUnit::Test* CHierarchicalResultsLevelSetTest::suite() { CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CHierarchicalResultsLevelSetTest"); suiteOfTests->addTest(new CppUnit::TestCaller( - "CHierarchicalResultsLevelSetTest::testElementsWithPerPartitionNormalisation", - &CHierarchicalResultsLevelSetTest::testElementsWithPerPartitionNormalisation)); + "CHierarchicalResultsLevelSetTest::testElements", + &CHierarchicalResultsLevelSetTest::testElements)); return suiteOfTests; } @@ -66,7 +66,7 @@ void print(const TestNode* node) { std::cout << "'" << node->s_Name << "'" << std::endl; } -void CHierarchicalResultsLevelSetTest::testElementsWithPerPartitionNormalisation() { +void CHierarchicalResultsLevelSetTest::testElements() { // This is intentionally NOT an empty string from the string store, but // instead a completely separate empty string, such that its pointer will be // different to other empty string pointers. 
(In general, if you need @@ -102,35 +102,9 @@ void CHierarchicalResultsLevelSetTest::testElementsWithPerPartitionNormalisation std::vector result; - // without per partition normalization - { - CConcreteHierarchicalResultsLevelSet levelSet(root); - levelSet.elements(node, false, CTestNodeFactory(), result, false); - std::for_each(result.begin(), result.end(), print); - CPPUNIT_ASSERT_EQUAL(size_t(1), result.size()); - CPPUNIT_ASSERT_EQUAL(std::string("pA"), result[0]->s_Name); - } - - // with per partition normalization - { - CConcreteHierarchicalResultsLevelSet levelSet(root); - levelSet.elements(node, false, CTestNodeFactory(), result, true); - - CPPUNIT_ASSERT_EQUAL(size_t(1), result.size()); - CPPUNIT_ASSERT_EQUAL(std::string("pAv1"), result[0]->s_Name); - - ml::model::hierarchical_results_detail::SResultSpec specB; - specB.s_PartitionFieldName = PARTITION_B; - specB.s_PartitionFieldValue = PARTITION_VALUE_1; - - CConcreteHierarchicalResultsLevelSet::TNode nodeB(specB, emptyAnnotatedProb); - nodeB.s_Parent = &parent; - nodeB.s_Children.push_back(&child); - - levelSet.elements(nodeB, false, CTestNodeFactory(), result, true); - - std::for_each(result.begin(), result.end(), print); - CPPUNIT_ASSERT_EQUAL(size_t(1), result.size()); - CPPUNIT_ASSERT_EQUAL(std::string("pBv1"), result[0]->s_Name); - } + CConcreteHierarchicalResultsLevelSet levelSet(root); + levelSet.elements(node, false, CTestNodeFactory(), result); + std::for_each(result.begin(), result.end(), print); + CPPUNIT_ASSERT_EQUAL(size_t(1), result.size()); + CPPUNIT_ASSERT_EQUAL(std::string("pA"), result[0]->s_Name); } diff --git a/lib/model/unittest/CHierarchicalResultsLevelSetTest.h b/lib/model/unittest/CHierarchicalResultsLevelSetTest.h index e710fc4828..ab446be374 100644 --- a/lib/model/unittest/CHierarchicalResultsLevelSetTest.h +++ b/lib/model/unittest/CHierarchicalResultsLevelSetTest.h @@ -10,7 +10,7 @@ class CHierarchicalResultsLevelSetTest : public CppUnit::TestFixture { public: - void testElementsWithPerPartitionNormalisation(); + void testElements(); static CppUnit::Test* suite(); };