[ML] Remove old per-partition normalization code #184

Merged · 3 commits · Aug 20, 2018

Changes from all commits
6 changes: 0 additions & 6 deletions bin/autodetect/CCmdLineParser.cc
@@ -52,7 +52,6 @@ bool CCmdLineParser::parse(int argc,
                             bool& memoryUsage,
                             std::size_t& bucketResultsDelay,
                             bool& multivariateByFields,
-                            bool& perPartitionNormalization,
                             TStrVec& clauseTokens) {
     try {
         boost::program_options::options_description desc(DESCRIPTION);
@@ -116,8 +115,6 @@ bool CCmdLineParser::parse(int argc,
                 "The numer of half buckets to store before choosing which overlapping bucket has the biggest anomaly")
             ("multivariateByFields",
                 "Optional flag to enable multi-variate analysis of correlated by fields")
-            ("perPartitionNormalization",
-                "Optional flag to enable per partition normalization")
         ;
         // clang-format on

@@ -231,9 +228,6 @@ bool CCmdLineParser::parse(int argc,
         if (vm.count("multivariateByFields") > 0) {
             multivariateByFields = true;
         }
-        if (vm.count("perPartitionNormalization") > 0) {
-            perPartitionNormalization = true;
-        }

         boost::program_options::collect_unrecognized(
             parsed.options, boost::program_options::include_positional)
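Aside: the removed option followed the stock boost::program_options pattern, a value-less switch registered on an options_description and then tested with vm.count() after parsing. A minimal, self-contained sketch of that pattern (a standalone example, not code from this repository):

#include <boost/program_options.hpp>
#include <iostream>

namespace po = boost::program_options;

int main(int argc, char** argv) {
    bool perPartitionNormalization = false;

    po::options_description desc("Options");
    // A switch takes no value; its presence on the command line enables it.
    desc.add_options()
        ("perPartitionNormalization", "Optional flag to enable per partition normalization");

    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, desc), vm);
    po::notify(vm);

    // vm.count() reports how many times the option appeared, hence the > 0 test.
    if (vm.count("perPartitionNormalization") > 0) {
        perPartitionNormalization = true;
    }

    std::cout << "perPartitionNormalization = " << std::boolalpha
              << perPartitionNormalization << '\n';
    return 0;
}

Deleting the option registration, the vm.count() check, and the out-parameter, as this PR does in both binaries, removes the flag end to end.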
1 change: 0 additions & 1 deletion bin/autodetect/CCmdLineParser.h
@@ -64,7 +64,6 @@ class CCmdLineParser {
                       bool& memoryUsage,
                       std::size_t& bucketResultsDelay,
                       bool& multivariateByFields,
-                      bool& perPartitionNormalization,
                       TStrVec& clauseTokens);

 private:
4 changes: 1 addition & 3 deletions bin/autodetect/Main.cc
@@ -88,7 +88,6 @@ int main(int argc, char** argv) {
     bool memoryUsage(false);
     std::size_t bucketResultsDelay(0);
     bool multivariateByFields(false);
-    bool perPartitionNormalization(false);
     TStrVec clauseTokens;
     if (ml::autodetect::CCmdLineParser::parse(
             argc, argv, limitConfigFile, modelConfigFile, fieldConfigFile,
@@ -98,7 +97,7 @@ int main(int argc, char** argv) {
             maxQuantileInterval, inputFileName, isInputFileNamedPipe, outputFileName,
             isOutputFileNamedPipe, restoreFileName, isRestoreFileNamedPipe, persistFileName,
             isPersistFileNamedPipe, maxAnomalyRecords, memoryUsage, bucketResultsDelay,
-            multivariateByFields, perPartitionNormalization, clauseTokens) == false) {
+            multivariateByFields, clauseTokens) == false) {
         return EXIT_FAILURE;
     }

@@ -146,7 +145,6 @@ int main(int argc, char** argv) {
         ml::model::CAnomalyDetectorModelConfig::defaultConfig(
             bucketSpan, summaryMode, summaryCountFieldName, latency,
             bucketResultsDelay, multivariateByFields);
-    modelConfig.perPartitionNormalization(perPartitionNormalization);
     modelConfig.detectionRules(ml::model::CAnomalyDetectorModelConfig::TIntDetectionRuleVecUMapCRef(
         fieldConfig.detectionRules()));
     modelConfig.scheduledEvents(ml::model::CAnomalyDetectorModelConfig::TStrDetectionRulePrVecCRef(
8 changes: 1 addition & 7 deletions bin/normalize/CCmdLineParser.cc
@@ -30,8 +30,7 @@ bool CCmdLineParser::parse(int argc,
                            bool& isOutputFileNamedPipe,
                            std::string& quantilesState,
                            bool& deleteStateFiles,
-                           bool& writeCsv,
-                           bool& perPartitionNormalization) {
+                           bool& writeCsv) {
     try {
         boost::program_options::options_description desc(DESCRIPTION);
         // clang-format off
@@ -60,8 +59,6 @@ bool CCmdLineParser::parse(int argc,
                 "If this flag is set then delete the normalizer state files once they have been read")
             ("writeCsv",
                 "Write the results in CSV format (default is lineified JSON)")
-            ("perPartitionNormalization",
-                "Optional flag to enable per partition normalization")
         ;
         // clang-format on

@@ -114,9 +111,6 @@ bool CCmdLineParser::parse(int argc,
         if (vm.count("writeCsv") > 0) {
             writeCsv = true;
         }
-        if (vm.count("perPartitionNormalization") > 0) {
-            perPartitionNormalization = true;
-        }
     } catch (std::exception& e) {
         std::cerr << "Error processing command line: " << e.what() << std::endl;
         return false;
3 changes: 1 addition & 2 deletions bin/normalize/CCmdLineParser.h
@@ -43,8 +43,7 @@ class CCmdLineParser {
                       bool& isOutputFileNamedPipe,
                       std::string& quantilesState,
                       bool& deleteStateFiles,
-                      bool& writeCsv,
-                      bool& perPartitionNormalization);
+                      bool& writeCsv);

 private:
     static const std::string DESCRIPTION;
9 changes: 3 additions & 6 deletions bin/normalize/Main.cc
@@ -54,12 +54,10 @@ int main(int argc, char** argv) {
     std::string quantilesStateFile;
     bool deleteStateFiles(false);
     bool writeCsv(false);
-    bool perPartitionNormalization(false);
     if (ml::normalize::CCmdLineParser::parse(
-            argc, argv, modelConfigFile, logProperties, logPipe, bucketSpan,
-            lengthEncodedInput, inputFileName, isInputFileNamedPipe,
-            outputFileName, isOutputFileNamedPipe, quantilesStateFile,
-            deleteStateFiles, writeCsv, perPartitionNormalization) == false) {
+            argc, argv, modelConfigFile, logProperties, logPipe, bucketSpan, lengthEncodedInput,
+            inputFileName, isInputFileNamedPipe, outputFileName, isOutputFileNamedPipe,
+            quantilesStateFile, deleteStateFiles, writeCsv) == false) {
         return EXIT_FAILURE;
     }

@@ -93,7 +91,6 @@ int main(int argc, char** argv) {
         LOG_FATAL(<< "Ml model config file '" << modelConfigFile << "' could not be loaded");
         return EXIT_FAILURE;
     }
-    modelConfig.perPartitionNormalization(perPartitionNormalization);

     // There's a choice of input and output formats for the numbers to be normalised
     using TInputParserUPtr = std::unique_ptr<ml::api::CInputParser>;
10 changes: 1 addition & 9 deletions include/api/CHierarchicalResultsWriter.h
@@ -51,12 +51,7 @@ class API_EXPORT CHierarchicalResultsWriter : public model::CHierarchicalResults
     using TStr1Vec = core::CSmallVector<std::string, 1>;

 public:
-    enum EResultType {
-        E_SimpleCountResult,
-        E_PopulationResult,
-        E_PartitionResult,
-        E_Result
-    };
+    enum EResultType { E_SimpleCountResult, E_PopulationResult, E_Result };
     //! Type which wraps up the results of anomaly detection.
     struct API_EXPORT SResults {
         //! Construct for population results
@@ -168,9 +163,6 @@ class API_EXPORT CHierarchicalResultsWriter : public model::CHierarchicalResults
     //! pivot.
     void writePivotResult(const model::CHierarchicalResults& results, const TNode& node);

-    //! Write partition result if \p node is a partition level result
-    void writePartitionResult(const model::CHierarchicalResults& results, const TNode& node);
-
     //! Write out a simple count result if \p node is simple
     //! count.
     void writeSimpleCountResult(const TNode& node);
7 changes: 0 additions & 7 deletions include/api/CJsonOutputWriter.h
@@ -162,9 +162,6 @@ class API_EXPORT CJsonOutputWriter : public COutputHandler {
         // when the number to write is limited
         double s_LowestBucketInfluencerScore;

-        //! Partition scores
-        TDocumentWeakPtrVec s_PartitionScoreDocuments;
-
         //! scheduled event descriptions
         TStr1Vec s_ScheduledEventDescriptions;
     };
@@ -304,10 +301,6 @@ class API_EXPORT CJsonOutputWriter : public COutputHandler {
     void addInfluences(const CHierarchicalResultsWriter::TStoredStringPtrStoredStringPtrPrDoublePrVec& influenceResults,
                        TDocumentWeakPtr weakDoc);

-    //! Write partition score & probability
-    void addPartitionScores(const CHierarchicalResultsWriter::TResults& results,
-                            TDocumentWeakPtr weakDoc);
-
 private:
     //! The job ID
     std::string m_JobId;
9 changes: 0 additions & 9 deletions include/api/CResultNormalizer.h
@@ -93,15 +93,6 @@ class API_EXPORT CResultNormalizer {
                          std::string& valueFieldName,
                          double& probability);

-    bool parseDataFields(const TStrStrUMap& dataRowFields,
-                         std::string& level,
-                         std::string& partition,
-                         std::string& partitionValue,
-                         std::string& person,
-                         std::string& function,
-                         std::string& valueFieldName,
-                         double& probability);
-
     template<typename T>
     bool parseDataField(const TStrStrUMap& dataRowFields,
                         const std::string& fieldName,
9 changes: 0 additions & 9 deletions include/model/CAnomalyDetectorModelConfig.h
@@ -418,12 +418,6 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig {
     const TDoubleDoublePrVec& normalizedScoreKnotPoints() const;
     //@}

-    //! Check if we should create one normalizer per partition field value.
-    bool perPartitionNormalization() const;
-
-    //! Set whether we should create one normalizer per partition field value.
-    void perPartitionNormalization(bool value);
-
     //! Sets the reference to the detection rules map
     void detectionRules(TIntDetectionRuleVecUMapCRef detectionRules);

@@ -494,9 +488,6 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig {
     //! and the normalized anomaly score with these knot points.
     //! \see DEFAULT_NORMALIZED_SCORE_KNOT_POINTS for details.
     TDoubleDoublePrVec m_NormalizedScoreKnotPoints;
-
-    //! If true then create one normalizer per partition field value.
-    bool m_PerPartitionNormalisation;
     //@}

     //! A reference to the map containing detection rules per
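For readers unfamiliar with the convention, the deleted accessors used the overloaded getter/setter idiom this config class applies throughout: a const getter and a setter sharing one name, distinguished by signature. A minimal sketch of the idiom (hypothetical class, not the removed code):

#include <iostream>

// Illustration of the getter/setter naming style used by
// CAnomalyDetectorModelConfig; this is not the removed code itself.
class Config {
public:
    // Getter: const, takes nothing, returns the value.
    bool perPartitionNormalization() const { return m_PerPartitionNormalization; }
    // Setter: same name, overloaded on the parameter.
    void perPartitionNormalization(bool value) { m_PerPartitionNormalization = value; }

private:
    bool m_PerPartitionNormalization = false;
};

int main() {
    Config config;
    config.perPartitionNormalization(true);
    std::cout << std::boolalpha << config.perPartitionNormalization() << '\n'; // true
    return 0;
}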
31 changes: 13 additions & 18 deletions include/model/CHierarchicalResultsLevelSet.h
@@ -162,11 +162,7 @@ class CHierarchicalResultsLevelSet : public CHierarchicalResultsVisitor {

     //! Get and possibly add a normalizer for \p node.
     template<typename FACTORY>
-    void elements(const TNode& node,
-                  bool pivot,
-                  const FACTORY& factory,
-                  TTypePtrVec& result,
-                  bool distinctLeavesPerPartition = false) {
+    void elements(const TNode& node, bool pivot, const FACTORY& factory, TTypePtrVec& result) {
         result.clear();
         if (this->isSimpleCount(node)) {
             return;
@@ -193,39 +189,38 @@
             return;
         }

-        std::string partitionKey = distinctLeavesPerPartition
-                                       ? *node.s_Spec.s_PartitionFieldName +
-                                             *node.s_Spec.s_PartitionFieldValue
-                                       : *node.s_Spec.s_PartitionFieldName;
-
         if (this->isLeaf(node)) {
-            TWord word = ms_Dictionary.word(partitionKey, *node.s_Spec.s_PersonFieldName,
-                                            *node.s_Spec.s_FunctionName,
-                                            *node.s_Spec.s_ValueFieldName);
+            TWord word = ms_Dictionary.word(
+                *node.s_Spec.s_PartitionFieldName, *node.s_Spec.s_PersonFieldName,
+                *node.s_Spec.s_FunctionName, *node.s_Spec.s_ValueFieldName);
             TWordTypePrVecItr i = element(m_LeafSet, word);
             if (i == m_LeafSet.end() || i->first != word) {
                 i = m_LeafSet.insert(
-                    i, TWordTypePr(word, factory.make(partitionKey, *node.s_Spec.s_PersonFieldName,
+                    i, TWordTypePr(word, factory.make(*node.s_Spec.s_PartitionFieldName,
+                                                      *node.s_Spec.s_PersonFieldName,
                                                       *node.s_Spec.s_FunctionName,
                                                       *node.s_Spec.s_ValueFieldName)));
             }
             result.push_back(&i->second);
         }
         if (this->isPerson(node)) {
-            TWord word = ms_Dictionary.word(partitionKey, *node.s_Spec.s_PersonFieldName);
+            TWord word = ms_Dictionary.word(*node.s_Spec.s_PartitionFieldName,
+                                            *node.s_Spec.s_PersonFieldName);
             TWordTypePrVecItr i = element(m_PersonSet, word);
             if (i == m_PersonSet.end() || i->first != word) {
                 i = m_PersonSet.insert(
-                    i, TWordTypePr(word, factory.make(partitionKey, *node.s_Spec.s_PersonFieldName)));
+                    i, TWordTypePr(word, factory.make(*node.s_Spec.s_PartitionFieldName,
+                                                      *node.s_Spec.s_PersonFieldName)));
             }
             result.push_back(&i->second);
         }
         if (this->isPartition(node)) {
-            TWord word = ms_Dictionary.word(partitionKey);
+            TWord word = ms_Dictionary.word(*node.s_Spec.s_PartitionFieldName);

             TWordTypePrVecItr i = element(m_PartitionSet, word);
             if (i == m_PartitionSet.end() || i->first != word) {
-                i = m_PartitionSet.insert(i, TWordTypePr(word, factory.make(partitionKey)));
+                i = m_PartitionSet.insert(
+                    i, TWordTypePr(word, factory.make(*node.s_Spec.s_PartitionFieldName)));
             }
             result.push_back(&i->second);
         }
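The substance of this change: the normalizer lookup key previously appended the partition field value to the partition field name when per-partition normalization was enabled, so every partition value got its own normalizer; the key is now always just the partition field name. A simplified sketch of the keying difference (std::map and hypothetical types stand in for the real compressed-dictionary machinery):

#include <iostream>
#include <map>
#include <string>

// Stand-in for the real normalizer type; purely illustrative.
struct Normalizer {};

int main() {
    // Keyed collection of normalizers, analogous to m_LeafSet and friends
    // (the real code keys on a dictionary word, not a raw string).
    std::map<std::string, Normalizer> normalizers;

    const std::string partitionName = "airline";      // partition field name
    const std::string personName = "responsetime";

    // Old behaviour when per-partition normalization was enabled: the
    // partition field *value* was folded into the key, so state grew with
    // the number of distinct partition values.
    for (const std::string& partitionValue : {"AAL", "JAL", "KLM"}) {
        normalizers.emplace(partitionName + partitionValue + personName, Normalizer{});
    }
    std::cout << "per-partition keys: " << normalizers.size() << '\n'; // 3

    // New behaviour: only the partition field name participates, so all
    // values of "airline" share one normalizer at this level.
    normalizers.clear();
    for (const std::string& partitionValue : {"AAL", "JAL", "KLM"}) {
        (void)partitionValue; // the value no longer participates in the key
        normalizers.emplace(partitionName + personName, Normalizer{});
    }
    std::cout << "shared keys: " << normalizers.size() << '\n'; // 1
    return 0;
}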
29 changes: 0 additions & 29 deletions lib/api/CHierarchicalResultsWriter.cc
@@ -126,7 +126,6 @@ void CHierarchicalResultsWriter::visit(const model::CHierarchicalResults& result
     } else {
         this->writePopulationResult(results, node);
         this->writeIndividualResult(results, node);
-        this->writePartitionResult(results, node);
         this->writeSimpleCountResult(node);
     }
 }
@@ -258,34 +257,6 @@ void CHierarchicalResultsWriter::writeIndividualResult(const model::CHierarchica
                                node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST));
 }

-void CHierarchicalResultsWriter::writePartitionResult(const model::CHierarchicalResults& results,
-                                                      const TNode& node) {
-    if (!m_ModelConfig.perPartitionNormalization() || this->isSimpleCount(node) ||
-        this->isPopulation(node) || !this->isPartition(node) ||
-        !this->shouldWriteResult(m_Limits, results, node, false)) {
-        return;
-    }
-
-    model_t::EFeature feature =
-        node.s_AnnotatedProbability.s_AttributeProbabilities.empty()
-            ? model_t::E_IndividualCountByBucketAndPerson
-            : node.s_AnnotatedProbability.s_AttributeProbabilities[0].s_Feature;
-
-    TDouble1Vec emptyDoubleVec;
-
-    m_ResultWriterFunc(TResults(
-        E_PartitionResult, *node.s_Spec.s_PartitionFieldName,
-        *node.s_Spec.s_PartitionFieldValue, *node.s_Spec.s_ByFieldName,
-        *node.s_Spec.s_PersonFieldValue, EMPTY_STRING, node.s_BucketStartTime,
-        *node.s_Spec.s_FunctionName, model_t::outputFunctionName(feature),
-        node.s_AnnotatedProbability.s_BaselineBucketCount,
-        node.s_AnnotatedProbability.s_CurrentBucketCount, emptyDoubleVec, emptyDoubleVec,
-        node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore, node.probability(),
-        *node.s_Spec.s_ValueFieldName, node.s_AnnotatedProbability.s_Influences,
-        node.s_Spec.s_UseNull, model::function_t::isMetric(node.s_Spec.s_Function),
-        node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST));
-}
-
 void CHierarchicalResultsWriter::writePivotResult(const model::CHierarchicalResults& results,
                                                   const TNode& node) {
     if (this->isSimpleCount(node) ||
47 changes: 0 additions & 47 deletions lib/api/CJsonOutputWriter.cc
@@ -70,7 +70,6 @@ const std::string EXAMPLES("examples");
 const std::string BUCKET_SPAN("bucket_span");
 const std::string PROCESSING_TIME("processing_time_ms");
 const std::string TIME_INFLUENCER("bucket_time");
-const std::string PARTITION_SCORES("partition_scores");
 const std::string SCHEDULED_EVENTS("scheduled_events");
 const std::string QUANTILES("quantiles");

@@ -191,14 +190,6 @@ bool CJsonOutputWriter::acceptResult(const CHierarchicalResultsWriter::TResults&
         return true;
     }

-    if (results.s_ResultType == CHierarchicalResultsWriter::E_PartitionResult) {
-        TDocumentWeakPtr partitionDoc = m_Writer.makeStorableDoc();
-        this->addPartitionScores(results, partitionDoc);
-        bucketData.s_PartitionScoreDocuments.push_back(partitionDoc);
-
-        return true;
-    }
-
     ++bucketData.s_RecordCount;

     TDocumentWeakPtrIntPrVec& detectorDocumentsToWrite = bucketData.s_DocumentsToWrite;
@@ -513,26 +504,6 @@ void CJsonOutputWriter::writeBucket(bool isInterim,
         m_Writer.EndArray();
     }

-    if (!bucketData.s_PartitionScoreDocuments.empty()) {
-        // Write the array of partition-anonaly score pairs
-        m_Writer.String(PARTITION_SCORES);
-        m_Writer.StartArray();
-        for (TDocumentWeakPtrVecItr partitionScoresIter =
-                 bucketData.s_PartitionScoreDocuments.begin();
-             partitionScoresIter != bucketData.s_PartitionScoreDocuments.end();
-             ++partitionScoresIter) {
-            TDocumentWeakPtr weakDoc = *partitionScoresIter;
-            TDocumentPtr docPtr = weakDoc.lock();
-            if (!docPtr) {
-                LOG_ERROR(<< "Inconsistent program state. JSON document unavailable.");
-                continue;
-            }
-
-            m_Writer.write(*docPtr);
-        }
-        m_Writer.EndArray();
-    }
-
     m_Writer.String(PROCESSING_TIME);
     m_Writer.Uint64(bucketProcessingTime);

@@ -816,24 +787,6 @@ void CJsonOutputWriter::addInfluencerFields(bool isBucketInfluencer,
     }
 }

-void CJsonOutputWriter::addPartitionScores(const CHierarchicalResultsWriter::TResults& results,
-                                           TDocumentWeakPtr weakDoc) {
-    TDocumentPtr docPtr = weakDoc.lock();
-    if (!docPtr) {
-        LOG_ERROR(<< "Inconsistent program state. JSON document unavailable.");
-        return;
-    }
-
-    m_Writer.addDoubleFieldToObj(PROBABILITY, results.s_Probability, *docPtr);
-    m_Writer.addStringFieldCopyToObj(PARTITION_FIELD_NAME,
-                                     results.s_PartitionFieldName, *docPtr);
-    m_Writer.addStringFieldCopyToObj(PARTITION_FIELD_VALUE,
-                                     results.s_PartitionFieldValue, *docPtr, true);
-    m_Writer.addDoubleFieldToObj(INITIAL_RECORD_SCORE,
-                                 results.s_NormalizedAnomalyScore, *docPtr);
-    m_Writer.addDoubleFieldToObj(RECORD_SCORE, results.s_NormalizedAnomalyScore, *docPtr);
-}
-
 void CJsonOutputWriter::limitNumberRecords(size_t count) {
     m_RecordOutputLimit = count;
 }
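One pattern in the deleted writer code is worth calling out: result documents are held via std::weak_ptr and must be locked before writing, logging an error when the owning store has already dropped them. A minimal sketch of that guard (generic names, not the ml-cpp API):

#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Document {
    std::string json;
};

// Lock each weak reference before writing; skip any document whose owner
// has already released it rather than dereferencing a dead pointer.
void writeAll(const std::vector<std::weak_ptr<Document>>& docs) {
    for (const auto& weakDoc : docs) {
        std::shared_ptr<Document> doc = weakDoc.lock();
        if (!doc) {
            std::cerr << "Inconsistent program state. JSON document unavailable.\n";
            continue;
        }
        std::cout << doc->json << '\n';
    }
}

int main() {
    auto owned = std::make_shared<Document>(Document{"{\"record_score\":42.0}"});
    std::vector<std::weak_ptr<Document>> docs{owned};

    writeAll(docs); // prints the document
    owned.reset();  // the owning side releases it
    writeAll(docs); // logs the error and skips it
    return 0;
}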