Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Reduce false positives for the periodic component test for anomaly detection #1177

Merged
merged 9 commits into from May 4, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 2 additions & 5 deletions docs/CHANGELOG.asciidoc
Expand Up @@ -62,8 +62,8 @@
operations. (See {ml-pull}1142[#1142].)
* Fix spurious anomalies for count and sum functions after no data are received for long
periods of time. (See {ml-pull}1158[#1158].)

== {es} version 7.8.0
* Improve false positive rates from periodicity test for time series anomaly detection.
(See {ml-pull}1177[#1177].)

=== Bug Fixes

Expand All @@ -72,9 +72,6 @@
(See {ml-pull}1160[#1160], issue: {issue}55593[#55593].)
* Make categorization respect the `model_memory_limit`. (See {ml-pull}1167[#1167],
issue: {ml-issue}1130[#1130].)

=== Bug Fixes

* Fix underlying cause for "Failed to calculate splitting significance" log errors.
(See {ml-pull}1157[#1157].)

Expand Down
135 changes: 81 additions & 54 deletions lib/maths/CPeriodicityHypothesisTests.cc
Expand Up @@ -69,6 +69,38 @@ const std::size_t MINIMUM_REPEATS_TO_TEST_AMPLITUDE{4};
//! A high priority for components we want to take precedence.
double HIGH_PRIORITY{2.0};

//! \brief Used to compute conjunctions of smooth x < t or x > t
//! conditions.
//!
//! DESCRIPTION:\n
//! Wraps a soft truth value: each elementary condition contributes a
//! value in (0, 2) (2 * logistic), so a value greater than 1.0 reads
//! as "true". Conjunction multiplies the underlying values, which
//! smoothly penalises conditions that are only marginally satisfied.
class CSmoothCondition {
public:
    //! Construct from a soft truth value (defaults to "false").
    explicit CSmoothCondition(double value = 0.0) : m_Value{value} {}

    //! True if and only if the soft truth value exceeds 1.0, i.e. the
    //! (conjunction of) condition(s) holds on balance.
    operator bool() const { return m_Value > 1.0; }

    //! Order conditions by how strongly they hold.
    bool operator<(const CSmoothCondition& rhs) const {
        return m_Value < rhs.m_Value;
    }

    //! Get the underlying soft truth value.
    double p() const { return m_Value; }

    //! The smooth conjunction: multiply the soft truth values so both
    //! operands must hold comfortably for the result to read as true.
    friend CSmoothCondition operator&&(const CSmoothCondition& lhs,
                                       const CSmoothCondition& rhs) {
        return CSmoothCondition{lhs.m_Value * rhs.m_Value};
    }

private:
    //! The soft truth value in [0, 2) per elementary condition.
    double m_Value;
};

//! Smoothly check if \p value is greater than \p threshold.
//!
//! The result's soft truth value is 2 * logistic, so it crosses the
//! "true" boundary (1.0) exactly where \p value crosses \p threshold;
//! \p margin controls how sharp that transition is.
CSmoothCondition softGreaterThan(double value, double threshold, double margin) {
    double truth{2.0 * CTools::logisticFunction(value, margin, threshold, +1.0)};
    return CSmoothCondition{truth};
}

//! Smoothly check if \p value is less than \p threshold.
//!
//! Mirror image of softGreaterThan: the soft truth value crosses 1.0
//! where \p value crosses \p threshold from above, with \p margin
//! controlling the sharpness of the transition.
CSmoothCondition softLessThan(double value, double threshold, double margin) {
    double truth{2.0 * CTools::logisticFunction(value, margin, threshold, -1.0)};
    return CSmoothCondition{truth};
}

//! \brief Accumulates the minimum amplitude.
class CMinAmplitude {
public:
Expand Down Expand Up @@ -1231,24 +1263,25 @@ CPeriodicityHypothesisTests::best(const TNestedHypothesesVec& hypotheses) const
TMinAccumulator vmin;
TMinAccumulator DFmin;
for (const auto& summary : summaries) {
vmin.add(varianceAtPercentile(summary.s_V, summary.s_DF,
50.0 + CONFIDENCE_INTERVAL / 2.0) /
summary.s_VarianceThreshold);
double v{varianceAtPercentile(summary.s_V, summary.s_DF,
50.0 + CONFIDENCE_INTERVAL / 2.0)};
vmin.add(v == summary.s_VarianceThreshold ? 1.0 : v / summary.s_VarianceThreshold);
DFmin.add(summary.s_DF);
}

TMinAccumulator pmin;
for (const auto& summary : summaries) {
double v{varianceAtPercentile(summary.s_V, summary.s_DF,
50.0 - CONFIDENCE_INTERVAL / 2.0) /
summary.s_VarianceThreshold / vmin[0]};
double R{summary.s_R / summary.s_AutocorrelationThreshold};
double DF{summary.s_DF / DFmin[0]};
double p{CTools::logisticFunction(v, 0.2, 1.0, -1.0) *
CTools::logisticFunction(R, 0.2, 1.0, +1.0) *
CTools::logisticFunction(DF, 0.2, 1.0, +1.0) *
CTools::logisticFunction(summary.s_TrendSegments, 0.3, 0.0, -1.0) *
CTools::logisticFunction(summary.s_ScaleSegments, 0.3, 0.0, -1.0)};
50.0 - CONFIDENCE_INTERVAL / 2.0)};
v = v == summary.s_VarianceThreshold * vmin[0]
? 1.0
: v / summary.s_VarianceThreshold / vmin[0];
double p{(softLessThan(v, 1.0, 0.2) &&
softGreaterThan(summary.s_R, summary.s_AutocorrelationThreshold, 0.1) &&
softGreaterThan(summary.s_DF / DFmin[0], 1.0, 0.2) &&
softLessThan(summary.s_TrendSegments, 0.0, 0.3) &&
softLessThan(summary.s_ScaleSegments, 0.0, 0.3))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice 👍

.p()};
LOG_TRACE(<< "p = " << p);
if (pmin.add(-p)) {
result = summary.s_H;
Expand Down Expand Up @@ -2062,7 +2095,7 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition
return CBasicStatistics::mean(result);
};

double p{0.0};
CSmoothCondition correlationCondition;
double R{-1.0};

TFloatMeanAccumulatorVec partitionValues;
Expand All @@ -2084,27 +2117,23 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition
}

double meanRepeats{calculateMeanRepeats(window, period_)};
double relativeMeanRepeats{meanRepeats / MINIMUM_REPEATS_TO_TEST_VARIANCE};
LOG_TRACE(<< " relative mean repeats = " << relativeMeanRepeats);

p = std::max(
p, CTools::logisticFunction(RW / stats.s_AutocorrelationThreshold, 0.15, 1.0) *
CTools::logisticFunction(relativeMeanRepeats, 0.25, 1.0));
double meanRepeatsPerSegment{meanRepeats / std::max(stats.s_TrendSegments, 1.0) /
MINIMUM_REPEATS_TO_TEST_VARIANCE};
LOG_TRACE(<< " mean repeats per segment = " << meanRepeatsPerSegment);

correlationCondition =
std::max(correlationCondition,
softGreaterThan(R, stats.s_AutocorrelationThreshold, 0.1) &&
softGreaterThan(meanRepeatsPerSegment, 1.0, 0.2));
R = std::max(R, RW);
}

double relativeLogSignificance{
double logSignificance{
CTools::fastLog(CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double meanRepeats{calculateMeanRepeats({{0, windowLength}}, repeat)};
double segmentsPerRepeat{(stats.s_TrendSegments - 1.0) / meanRepeats};
p *= CTools::logisticFunction(relativeLogSignificance, 0.1, 1.0) *
(vt > v1 ? CTools::logisticFunction(vt / v1, 1.0, 1.0, +1.0)
: CTools::logisticFunction(v1 / vt, 0.1, 1.0, -1.0)) *
CTools::logisticFunction(segmentsPerRepeat, 0.3, 0.0, -1.0) / 0.03125;
LOG_TRACE(<< " p(partition) = " << p);

if (p >= 1.0) {

if (correlationCondition && softGreaterThan(logSignificance, 1.0, 0.1) &&
(vt > v1 ? softGreaterThan(vt / v1, 1.0, 1.0) : softLessThan(v1 / vt, 1.0, 0.1))) {
stats.s_StartOfPartition = startOfPartition;
stats.s_R0 = R;
return true;
Expand Down Expand Up @@ -2145,7 +2174,6 @@ bool CPeriodicityHypothesisTests::testVariance(const TTimeTimePr2Vec& window,
result.add(calculateRepeats(window, period_, m_BucketLength, buckets));
return CBasicStatistics::mean(result);
}();
LOG_TRACE(<< " mean repeats = " << meanRepeats);

// We're trading off:
// 1) The significance of the variance reduction,
Expand All @@ -2159,22 +2187,24 @@ bool CPeriodicityHypothesisTests::testVariance(const TTimeTimePr2Vec& window,
// is equal to the threshold, the variance reduction is equal to the
// threshold and we've observed three periods on average.

double relativeLogSignificance{
double logSignificance{
CTools::fastLog(CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double relativeMeanRepeats{meanRepeats / MINIMUM_REPEATS_TO_TEST_VARIANCE};
double segmentsPerRepeat{(stats.s_TrendSegments +
std::max(static_cast<double>(segmentation.size()), 1.0) - 2.0) /
meanRepeats};
pVariance = CTools::logisticFunction(relativeLogSignificance, 0.1, 1.0) *
CTools::logisticFunction(R / stats.s_AutocorrelationThreshold, 0.15, 1.0) *
(vt > v1 ? CTools::logisticFunction(vt / v1, 1.0, 1.0, +1.0)
: CTools::logisticFunction(v1 / vt, 0.1, 1.0, -1.0)) *
CTools::logisticFunction(relativeMeanRepeats, 0.25, 1.0) *
CTools::logisticFunction(segmentsPerRepeat, 0.3, 0.0, -1.0) / 0.03125;
double meanRepeatsPerSegment{
meanRepeats /
std::max(stats.s_TrendSegments + static_cast<double>(segmentation.size()), 1.0) /
MINIMUM_REPEATS_TO_TEST_VARIANCE};
LOG_TRACE(<< " mean repeats per segment = " << meanRepeatsPerSegment);

auto condition = softGreaterThan(logSignificance, 1.0, 0.1) &&
softGreaterThan(R, stats.s_AutocorrelationThreshold, 0.1) &&
(vt > v1 ? softGreaterThan(vt / v1, 1.0, 1.0)
: softLessThan(v1 / vt, 0.1, 1.0)) &&
softGreaterThan(meanRepeatsPerSegment, 1.0, 0.2);
pVariance = condition.p();
LOG_TRACE(<< " p(variance) = " << pVariance);

if (pVariance >= 1.0) {
if (condition) {
stats.s_R0 = R;
stats.s_Segmentation = segmentation;
return true;
Expand Down Expand Up @@ -2226,19 +2256,16 @@ bool CPeriodicityHypothesisTests::testAmplitude(const TTimeTimePr2Vec& window,

// Trade off the test significance and the mean number of repeats
// we've observed.
double relativeLogSignificance{CTools::fastLog(CTools::oneMinusPowOneMinusX(F1, b)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double relativeMeanRepeats{
meanRepeats / static_cast<double>(MINIMUM_REPEATS_TO_TEST_AMPLITUDE)};
double logSignificance{CTools::fastLog(CTools::oneMinusPowOneMinusX(F1, b)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double meanRepeatsPerSegment{meanRepeats / std::max(stats.s_TrendSegments, 1.0) /
static_cast<double>(MINIMUM_REPEATS_TO_TEST_AMPLITUDE)};
double minusLogPVariance{-CTools::fastLog(pVariance)};
double segmentsPerRepeat{(stats.s_TrendSegments - 1.0) / meanRepeats};
double pAmplitude{CTools::logisticFunction(relativeLogSignificance, 0.2, 1.0) *
CTools::logisticFunction(relativeMeanRepeats, 0.5, 1.0) *
CTools::logisticFunction(minusLogPVariance, 2.0, 0.0, -1.0) *
CTools::logisticFunction(segmentsPerRepeat, 0.3, 0.0, -1.0) / 0.0625};
LOG_TRACE(<< " p(amplitude) = " << pAmplitude);

if (pAmplitude >= 1.0) {
LOG_TRACE(<< " mean repeats per segment = " << meanRepeatsPerSegment);

if (softGreaterThan(logSignificance, 1.0, 0.2) &&
softLessThan(minusLogPVariance, 0.0, 2.0) &&
softGreaterThan(meanRepeatsPerSegment, 1.0, 0.2)) {
stats.s_R0 = R;
return true;
}
Expand Down
18 changes: 12 additions & 6 deletions lib/maths/CTimeSeriesModel.cc
Expand Up @@ -1551,8 +1551,9 @@ CUnivariateTimeSeriesModel::updateTrend(const TTimeDouble2VecSizeTrVec& samples,
}
}

// Time order is not reliable, for example if the data are polled
// or for count feature, the times of all samples will be the same.
// Time order is not a total order, for example if the data are polled
// the times of all samples will be the same. So break ties using the
// sample value.
TSizeVec timeorder(samples.size());
std::iota(timeorder.begin(), timeorder.end(), 0);
std::stable_sort(timeorder.begin(), timeorder.end(),
Expand Down Expand Up @@ -1656,6 +1657,7 @@ void CUnivariateTimeSeriesModel::reinitializeStateGivenNewComponent(TFloatMeanAc
// We can't properly handle periodicity in the variance of the rate if
// using a Poisson process so remove it from model detection if we detect
// seasonality.
double numberSamples{m_ResidualModel->numberSamples()};
m_ResidualModel->removeModels(
maths::CPrior::CModelFilter().remove(maths::CPrior::E_Poisson));
m_ResidualModel->setToNonInformative(0.0, m_ResidualModel->decayRate());
Expand All @@ -1665,7 +1667,8 @@ void CUnivariateTimeSeriesModel::reinitializeStateGivenNewComponent(TFloatMeanAc
[](double weight, const TFloatMeanAccumulator& sample) {
return weight + CBasicStatistics::count(sample);
})};
double weightScale{10.0 * std::max(this->params().learnRate(), 1.0) / Z};
double weightScale{
std::min(10.0 * std::max(this->params().learnRate(), 1.0), numberSamples) / Z};
maths_t::TDoubleWeightsAry1Vec weights(1);
for (const auto& residual : residuals) {
double weight(CBasicStatistics::count(residual));
Expand Down Expand Up @@ -2862,8 +2865,9 @@ CMultivariateTimeSeriesModel::updateTrend(const TTimeDouble2VecSizeTrVec& sample
}
}

// Time order is not reliable, for example if the data are polled
// or for count feature, the times of all samples will be the same.
// Time order is not a total order, for example if the data are polled
// the times of all samples will be the same. So break ties using the
// sample value.
TSizeVec timeorder(samples.size());
std::iota(timeorder.begin(), timeorder.end(), 0);
std::stable_sort(timeorder.begin(), timeorder.end(),
Expand Down Expand Up @@ -2965,6 +2969,7 @@ void CMultivariateTimeSeriesModel::reinitializeStateGivenNewComponent(TFloatMean
// re-weight so that the total sample weight corresponds to the sample
// weight the model receives from a fixed (shortish) time interval.

double numberSamples{m_ResidualModel->numberSamples()};
m_ResidualModel->setToNonInformative(0.0, m_ResidualModel->decayRate());

if (residuals.size() > 0) {
Expand All @@ -2988,7 +2993,8 @@ void CMultivariateTimeSeriesModel::reinitializeStateGivenNewComponent(TFloatMean
}

double Z{std::accumulate(weights.begin(), weights.end(), 0.0)};
double weightScale{10.0 * std::max(this->params().learnRate(), 1.0) / Z};
double weightScale{
std::min(10.0 * std::max(this->params().learnRate(), 1.0), numberSamples) / Z};
maths_t::TDouble10VecWeightsAry1Vec weight(1);
for (std::size_t i = 0; i < samples.size(); ++i) {
if (weights[i] > 0.0) {
Expand Down
24 changes: 13 additions & 11 deletions lib/maths/unittest/CPeriodicityHypothesisTestsTest.cc
Expand Up @@ -179,7 +179,7 @@ BOOST_AUTO_TEST_CASE(testDiurnal) {
}

LOG_DEBUG(<< "Recall = " << TP / (TP + FN));
BOOST_TEST_REQUIRE(TP / (TP + FN) > 0.99);
BOOST_TEST_REQUIRE(TP / (TP + FN) > 0.98);
}

LOG_DEBUG(<< "");
Expand Down Expand Up @@ -249,7 +249,8 @@ BOOST_AUTO_TEST_CASE(testDiurnal) {
maths::CPeriodicityHypothesisTestsResult result{hypotheses.test()};
LOG_DEBUG(<< "result = " << result.print());
BOOST_TEST_REQUIRE(
(result.print() == "{ 'weekend daily' 'weekday daily' 'weekend weekly' }" ||
(result.print() == "{ 'daily' 'weekly' }" ||
result.print() == "{ 'weekend daily' 'weekday daily' 'weekend weekly' }" ||
result.print() == "{ 'weekend daily' 'weekday daily' 'weekend weekly' 'weekday weekly' }"));
hypotheses = maths::CPeriodicityHypothesisTests();
hypotheses.initialize(0 /*startTime*/, HOUR, window, DAY);
Expand Down Expand Up @@ -322,11 +323,9 @@ BOOST_AUTO_TEST_CASE(testDiurnal) {
core_t::TTime time{timeseries[i].first};
if (time > lastTest + window) {
maths::CPeriodicityHypothesisTestsResult result{hypotheses.test()};
const std::string& printedResult{result.print()};
LOG_DEBUG(<< "result = " << printedResult);
if (printedResult != "{ 'weekend daily' 'weekday daily' 'weekend weekly' }") {
BOOST_TEST_REQUIRE(printedResult == "{ 'weekend daily' 'weekday daily' 'weekend weekly' 'weekday weekly' }");
}
LOG_DEBUG(<< "result = " << result.print());
BOOST_TEST_REQUIRE((result.print() == "{ 'daily' 'weekly' }" ||
result.print() == "{ 'weekend daily' 'weekday daily' 'weekend weekly' 'weekday weekly' }"));
hypotheses = maths::CPeriodicityHypothesisTests();
hypotheses.initialize(0 /*startTime*/, HOUR, window, DAY);
lastTest += window;
Expand Down Expand Up @@ -510,7 +509,9 @@ BOOST_AUTO_TEST_CASE(testWithSparseData) {
if (t >= 2) {
maths::CPeriodicityHypothesisTestsResult result{hypotheses.test()};
LOG_DEBUG(<< "result = " << result.print());
BOOST_REQUIRE_EQUAL(std::string("{ 'daily' 'weekly' }"), result.print());
BOOST_TEST_REQUIRE(
(result.print() == "{ 'daily' 'weekly' }" ||
result.print() == std::string("{ 'weekend daily' 'weekday daily' }")));
}
}
}
Expand Down Expand Up @@ -666,8 +667,9 @@ BOOST_AUTO_TEST_CASE(testWithOutliers) {
maths::CPeriodicityHypothesisTestsResult result{
maths::testForPeriods(config, startTime, bucketLength, values)};
LOG_DEBUG(<< "result = " << result.print());
BOOST_TEST_REQUIRE(result.print() ==
std::string("{ 'weekend daily' 'weekday daily' 'weekend weekly' }"));
BOOST_TEST_REQUIRE(
(result.print() == std::string("{ 'daily' 'weekly' }") ||
result.print() == std::string("{ 'weekend daily' 'weekday daily' 'weekend weekly' }")));
}
}
}
Expand Down Expand Up @@ -947,7 +949,7 @@ BOOST_AUTO_TEST_CASE(testWithPiecewiseLinearTrend) {
}

LOG_DEBUG(<< "Recall = " << TP / (TP + FN));
BOOST_TEST_REQUIRE(TP / (TP + FN) > 0.8);
BOOST_TEST_REQUIRE(TP / (TP + FN) > 0.9);
}

BOOST_AUTO_TEST_SUITE_END()