Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Reduce false positives for the periodic component test for anomaly detection #1177

Merged
merged 9 commits into from May 4, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 2 additions & 5 deletions docs/CHANGELOG.asciidoc
Expand Up @@ -62,8 +62,8 @@
operations. (See {ml-pull}1142[#1142].)
* Fix spurious anomalies for count and sum functions after no data are received for long
periods of time. (See {ml-pull}1158[#1158].)

== {es} version 7.8.0
* Improve false positive rates from periodicity test for time series anomaly detection.
(See {ml-pull}1177[#1177].)

=== Bug Fixes

Expand All @@ -72,9 +72,6 @@
(See {ml-pull}1160[#1160], issue: {issue}55593[#55593].)
* Make categorization respect the `model_memory_limit`. (See {ml-pull}1167[#1167],
issue: {ml-issue}1130[#1130].)

=== Bug Fixes

* Fix underlying cause for "Failed to calculate splitting significance" log errors.
(See {ml-pull}1157[#1157].)

Expand Down
135 changes: 81 additions & 54 deletions lib/maths/CPeriodicityHypothesisTests.cc
Expand Up @@ -69,6 +69,38 @@ const std::size_t MINIMUM_REPEATS_TO_TEST_AMPLITUDE{4};
//! A high priority for components we want to take precedence.
double HIGH_PRIORITY{2.0};

//! \brief Used to compute conjunctions of smooth x < t or x > t
//! conditions.
//!
//! DESCRIPTION:\n
//! Wraps a soft truth value: each elementary condition contributes a
//! value in (0, 2) (2 * logistic), so a value greater than 1.0 reads
//! as "true". Conjunction multiplies the underlying values, which
//! smoothly penalises conditions that are only marginally satisfied.
class CSmoothCondition {
public:
    //! Construct from a soft truth value (defaults to "false").
    explicit CSmoothCondition(double value = 0.0) : m_Value{value} {}

    //! True if and only if the soft truth value exceeds 1.0, i.e. the
    //! (conjunction of) condition(s) holds on balance.
    operator bool() const { return m_Value > 1.0; }

    //! Order conditions by how strongly they hold.
    bool operator<(const CSmoothCondition& rhs) const {
        return m_Value < rhs.m_Value;
    }

    //! Get the underlying soft truth value.
    double p() const { return m_Value; }

    //! The smooth conjunction: multiply the soft truth values so both
    //! operands must hold comfortably for the result to read as true.
    friend CSmoothCondition operator&&(const CSmoothCondition& lhs,
                                       const CSmoothCondition& rhs) {
        return CSmoothCondition{lhs.m_Value * rhs.m_Value};
    }

private:
    //! The soft truth value in [0, 2) per elementary condition.
    double m_Value;
};

//! Smoothly check if \p value is greater than \p threshold.
//!
//! The result's soft truth value is 2 * logistic, so it crosses the
//! "true" boundary (1.0) exactly where \p value crosses \p threshold;
//! \p margin controls how sharp that transition is.
CSmoothCondition softGreaterThan(double value, double threshold, double margin) {
    double truth{2.0 * CTools::logisticFunction(value, margin, threshold, +1.0)};
    return CSmoothCondition{truth};
}

//! Smoothly check if \p value is less than \p threshold.
//!
//! Mirror image of softGreaterThan: the soft truth value crosses 1.0
//! where \p value crosses \p threshold from above, with \p margin
//! controlling the sharpness of the transition.
CSmoothCondition softLessThan(double value, double threshold, double margin) {
    double truth{2.0 * CTools::logisticFunction(value, margin, threshold, -1.0)};
    return CSmoothCondition{truth};
}

//! \brief Accumulates the minimum amplitude.
class CMinAmplitude {
public:
Expand Down Expand Up @@ -1231,24 +1263,25 @@ CPeriodicityHypothesisTests::best(const TNestedHypothesesVec& hypotheses) const
TMinAccumulator vmin;
TMinAccumulator DFmin;
for (const auto& summary : summaries) {
vmin.add(varianceAtPercentile(summary.s_V, summary.s_DF,
50.0 + CONFIDENCE_INTERVAL / 2.0) /
summary.s_VarianceThreshold);
double v{varianceAtPercentile(summary.s_V, summary.s_DF,
50.0 + CONFIDENCE_INTERVAL / 2.0)};
vmin.add(v == summary.s_VarianceThreshold ? 1.0 : v / summary.s_VarianceThreshold);
DFmin.add(summary.s_DF);
}

TMinAccumulator pmin;
for (const auto& summary : summaries) {
double v{varianceAtPercentile(summary.s_V, summary.s_DF,
50.0 - CONFIDENCE_INTERVAL / 2.0) /
summary.s_VarianceThreshold / vmin[0]};
double R{summary.s_R / summary.s_AutocorrelationThreshold};
double DF{summary.s_DF / DFmin[0]};
double p{CTools::logisticFunction(v, 0.2, 1.0, -1.0) *
CTools::logisticFunction(R, 0.2, 1.0, +1.0) *
CTools::logisticFunction(DF, 0.2, 1.0, +1.0) *
CTools::logisticFunction(summary.s_TrendSegments, 0.3, 0.0, -1.0) *
CTools::logisticFunction(summary.s_ScaleSegments, 0.3, 0.0, -1.0)};
50.0 - CONFIDENCE_INTERVAL / 2.0)};
v = v == summary.s_VarianceThreshold * vmin[0]
? 1.0
: v / summary.s_VarianceThreshold / vmin[0];
double p{(softLessThan(v, 1.0, 0.2) &&
softGreaterThan(summary.s_R, summary.s_AutocorrelationThreshold, 0.1) &&
softGreaterThan(summary.s_DF / DFmin[0], 1.0, 0.2) &&
softLessThan(summary.s_TrendSegments, 0.0, 0.3) &&
softLessThan(summary.s_ScaleSegments, 0.0, 0.3))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice 👍

.p()};
LOG_TRACE(<< "p = " << p);
if (pmin.add(-p)) {
result = summary.s_H;
Expand Down Expand Up @@ -2062,7 +2095,7 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition
return CBasicStatistics::mean(result);
};

double p{0.0};
CSmoothCondition correlationCondition;
double R{-1.0};

TFloatMeanAccumulatorVec partitionValues;
Expand All @@ -2084,27 +2117,23 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec& partition
}

double meanRepeats{calculateMeanRepeats(window, period_)};
double relativeMeanRepeats{meanRepeats / MINIMUM_REPEATS_TO_TEST_VARIANCE};
LOG_TRACE(<< " relative mean repeats = " << relativeMeanRepeats);

p = std::max(
p, CTools::logisticFunction(RW / stats.s_AutocorrelationThreshold, 0.15, 1.0) *
CTools::logisticFunction(relativeMeanRepeats, 0.25, 1.0));
double meanRepeatsPerSegment{meanRepeats / std::max(stats.s_TrendSegments, 1.0) /
MINIMUM_REPEATS_TO_TEST_VARIANCE};
LOG_TRACE(<< " mean repeats per segment = " << meanRepeatsPerSegment);

correlationCondition =
std::max(correlationCondition,
softGreaterThan(R, stats.s_AutocorrelationThreshold, 0.1) &&
softGreaterThan(meanRepeatsPerSegment, 1.0, 0.2));
R = std::max(R, RW);
}

double relativeLogSignificance{
double logSignificance{
CTools::fastLog(CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double meanRepeats{calculateMeanRepeats({{0, windowLength}}, repeat)};
double segmentsPerRepeat{(stats.s_TrendSegments - 1.0) / meanRepeats};
p *= CTools::logisticFunction(relativeLogSignificance, 0.1, 1.0) *
(vt > v1 ? CTools::logisticFunction(vt / v1, 1.0, 1.0, +1.0)
: CTools::logisticFunction(v1 / vt, 0.1, 1.0, -1.0)) *
CTools::logisticFunction(segmentsPerRepeat, 0.3, 0.0, -1.0) / 0.03125;
LOG_TRACE(<< " p(partition) = " << p);

if (p >= 1.0) {

if (correlationCondition && softGreaterThan(logSignificance, 1.0, 0.1) &&
(vt > v1 ? softGreaterThan(vt / v1, 1.0, 1.0) : softLessThan(v1 / vt, 1.0, 0.1))) {
stats.s_StartOfPartition = startOfPartition;
stats.s_R0 = R;
return true;
Expand Down Expand Up @@ -2145,7 +2174,6 @@ bool CPeriodicityHypothesisTests::testVariance(const TTimeTimePr2Vec& window,
result.add(calculateRepeats(window, period_, m_BucketLength, buckets));
return CBasicStatistics::mean(result);
}();
LOG_TRACE(<< " mean repeats = " << meanRepeats);

// We're trading off:
// 1) The significance of the variance reduction,
Expand All @@ -2159,22 +2187,24 @@ bool CPeriodicityHypothesisTests::testVariance(const TTimeTimePr2Vec& window,
// is equal to the threshold, the variance reduction is equal to the
// threshold and we've observed three periods on average.

double relativeLogSignificance{
double logSignificance{
CTools::fastLog(CStatisticalTests::leftTailFTest(v1 / v0, df1, df0)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double relativeMeanRepeats{meanRepeats / MINIMUM_REPEATS_TO_TEST_VARIANCE};
double segmentsPerRepeat{(stats.s_TrendSegments +
std::max(static_cast<double>(segmentation.size()), 1.0) - 2.0) /
meanRepeats};
pVariance = CTools::logisticFunction(relativeLogSignificance, 0.1, 1.0) *
CTools::logisticFunction(R / stats.s_AutocorrelationThreshold, 0.15, 1.0) *
(vt > v1 ? CTools::logisticFunction(vt / v1, 1.0, 1.0, +1.0)
: CTools::logisticFunction(v1 / vt, 0.1, 1.0, -1.0)) *
CTools::logisticFunction(relativeMeanRepeats, 0.25, 1.0) *
CTools::logisticFunction(segmentsPerRepeat, 0.3, 0.0, -1.0) / 0.03125;
double meanRepeatsPerSegment{
meanRepeats /
std::max(stats.s_TrendSegments + static_cast<double>(segmentation.size()), 1.0) /
MINIMUM_REPEATS_TO_TEST_VARIANCE};
LOG_TRACE(<< " mean repeats per segment = " << meanRepeatsPerSegment);

auto condition = softGreaterThan(logSignificance, 1.0, 0.1) &&
softGreaterThan(R, stats.s_AutocorrelationThreshold, 0.1) &&
(vt > v1 ? softGreaterThan(vt / v1, 1.0, 1.0)
: softLessThan(v1 / vt, 0.1, 1.0)) &&
softGreaterThan(meanRepeatsPerSegment, 1.0, 0.2);
pVariance = condition.p();
LOG_TRACE(<< " p(variance) = " << pVariance);

if (pVariance >= 1.0) {
if (condition) {
stats.s_R0 = R;
stats.s_Segmentation = segmentation;
return true;
Expand Down Expand Up @@ -2226,19 +2256,16 @@ bool CPeriodicityHypothesisTests::testAmplitude(const TTimeTimePr2Vec& window,

// Trade off the test significance and the mean number of repeats
// we've observed.
double relativeLogSignificance{CTools::fastLog(CTools::oneMinusPowOneMinusX(F1, b)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double relativeMeanRepeats{
meanRepeats / static_cast<double>(MINIMUM_REPEATS_TO_TEST_AMPLITUDE)};
double logSignificance{CTools::fastLog(CTools::oneMinusPowOneMinusX(F1, b)) /
LOG_COMPONENT_STATISTICALLY_SIGNIFICANCE};
double meanRepeatsPerSegment{meanRepeats / std::max(stats.s_TrendSegments, 1.0) /
static_cast<double>(MINIMUM_REPEATS_TO_TEST_AMPLITUDE)};
double minusLogPVariance{-CTools::fastLog(pVariance)};
double segmentsPerRepeat{(stats.s_TrendSegments - 1.0) / meanRepeats};
double pAmplitude{CTools::logisticFunction(relativeLogSignificance, 0.2, 1.0) *
CTools::logisticFunction(relativeMeanRepeats, 0.5, 1.0) *
CTools::logisticFunction(minusLogPVariance, 2.0, 0.0, -1.0) *
CTools::logisticFunction(segmentsPerRepeat, 0.3, 0.0, -1.0) / 0.0625};
LOG_TRACE(<< " p(amplitude) = " << pAmplitude);

if (pAmplitude >= 1.0) {
LOG_TRACE(<< " mean repeats per segment = " << meanRepeatsPerSegment);

if (softGreaterThan(logSignificance, 1.0, 0.2) &&
softLessThan(minusLogPVariance, 0.0, 2.0) &&
softGreaterThan(meanRepeatsPerSegment, 1.0, 0.2)) {
stats.s_R0 = R;
return true;
}
Expand Down
18 changes: 12 additions & 6 deletions lib/maths/CTimeSeriesModel.cc
Expand Up @@ -1551,8 +1551,9 @@ CUnivariateTimeSeriesModel::updateTrend(const TTimeDouble2VecSizeTrVec& samples,
}
}

// Time order is not reliable, for example if the data are polled
// or for count feature, the times of all samples will be the same.
// Time order is not a total order, for example if the data are polled
// the times of all samples will be the same. So break ties using the
// sample value.
TSizeVec timeorder(samples.size());
std::iota(timeorder.begin(), timeorder.end(), 0);
std::stable_sort(timeorder.begin(), timeorder.end(),
Expand Down Expand Up @@ -1656,6 +1657,7 @@ void CUnivariateTimeSeriesModel::reinitializeStateGivenNewComponent(TFloatMeanAc
// We can't properly handle periodicity in the variance of the rate if
// using a Poisson process so remove it from model detection if we detect
// seasonality.
double numberSamples{m_ResidualModel->numberSamples()};
m_ResidualModel->removeModels(
maths::CPrior::CModelFilter().remove(maths::CPrior::E_Poisson));
m_ResidualModel->setToNonInformative(0.0, m_ResidualModel->decayRate());
Expand All @@ -1665,7 +1667,8 @@ void CUnivariateTimeSeriesModel::reinitializeStateGivenNewComponent(TFloatMeanAc
[](double weight, const TFloatMeanAccumulator& sample) {
return weight + CBasicStatistics::count(sample);
})};
double weightScale{10.0 * std::max(this->params().learnRate(), 1.0) / Z};
double weightScale{
std::min(10.0 * std::max(this->params().learnRate(), 1.0), numberSamples) / Z};
maths_t::TDoubleWeightsAry1Vec weights(1);
for (const auto& residual : residuals) {
double weight(CBasicStatistics::count(residual));
Expand Down Expand Up @@ -2862,8 +2865,9 @@ CMultivariateTimeSeriesModel::updateTrend(const TTimeDouble2VecSizeTrVec& sample
}
}

// Time order is not reliable, for example if the data are polled
// or for count feature, the times of all samples will be the same.
// Time order is not a total order, for example if the data are polled
// the times of all samples will be the same. So break ties using the
// sample value.
TSizeVec timeorder(samples.size());
std::iota(timeorder.begin(), timeorder.end(), 0);
std::stable_sort(timeorder.begin(), timeorder.end(),
Expand Down Expand Up @@ -2965,6 +2969,7 @@ void CMultivariateTimeSeriesModel::reinitializeStateGivenNewComponent(TFloatMean
// re-weight so that the total sample weight corresponds to the sample
// weight the model receives from a fixed (shortish) time interval.

double numberSamples{m_ResidualModel->numberSamples()};
m_ResidualModel->setToNonInformative(0.0, m_ResidualModel->decayRate());

if (residuals.size() > 0) {
Expand All @@ -2988,7 +2993,8 @@ void CMultivariateTimeSeriesModel::reinitializeStateGivenNewComponent(TFloatMean
}

double Z{std::accumulate(weights.begin(), weights.end(), 0.0)};
double weightScale{10.0 * std::max(this->params().learnRate(), 1.0) / Z};
double weightScale{
std::min(10.0 * std::max(this->params().learnRate(), 1.0), numberSamples) / Z};
maths_t::TDouble10VecWeightsAry1Vec weight(1);
for (std::size_t i = 0; i < samples.size(); ++i) {
if (weights[i] > 0.0) {
Expand Down
24 changes: 13 additions & 11 deletions lib/maths/unittest/CPeriodicityHypothesisTestsTest.cc
Expand Up @@ -179,7 +179,7 @@ BOOST_AUTO_TEST_CASE(testDiurnal) {
}

LOG_DEBUG(<< "Recall = " << TP / (TP + FN));
BOOST_TEST_REQUIRE(TP / (TP + FN) > 0.99);
BOOST_TEST_REQUIRE(TP / (TP + FN) > 0.98);
}

LOG_DEBUG(<< "");
Expand Down Expand Up @@ -249,7 +249,8 @@ BOOST_AUTO_TEST_CASE(testDiurnal) {
maths::CPeriodicityHypothesisTestsResult result{hypotheses.test()};
LOG_DEBUG(<< "result = " << result.print());
BOOST_TEST_REQUIRE(
(result.print() == "{ 'weekend daily' 'weekday daily' 'weekend weekly' }" ||
(result.print() == "{ 'daily' 'weekly' }" ||
result.print() == "{ 'weekend daily' 'weekday daily' 'weekend weekly' }" ||
result.print() == "{ 'weekend daily' 'weekday daily' 'weekend weekly' 'weekday weekly' }"));
hypotheses = maths::CPeriodicityHypothesisTests();
hypotheses.initialize(0 /*startTime*/, HOUR, window, DAY);
Expand Down Expand Up @@ -322,11 +323,9 @@ BOOST_AUTO_TEST_CASE(testDiurnal) {
core_t::TTime time{timeseries[i].first};
if (time > lastTest + window) {
maths::CPeriodicityHypothesisTestsResult result{hypotheses.test()};
const std::string& printedResult{result.print()};
LOG_DEBUG(<< "result = " << printedResult);
if (printedResult != "{ 'weekend daily' 'weekday daily' 'weekend weekly' }") {
BOOST_TEST_REQUIRE(printedResult == "{ 'weekend daily' 'weekday daily' 'weekend weekly' 'weekday weekly' }");
}
LOG_DEBUG(<< "result = " << result.print());
BOOST_TEST_REQUIRE((result.print() == "{ 'daily' 'weekly' }" ||
result.print() == "{ 'weekend daily' 'weekday daily' 'weekend weekly' 'weekday weekly' }"));
hypotheses = maths::CPeriodicityHypothesisTests();
hypotheses.initialize(0 /*startTime*/, HOUR, window, DAY);
lastTest += window;
Expand Down Expand Up @@ -510,7 +509,9 @@ BOOST_AUTO_TEST_CASE(testWithSparseData) {
if (t >= 2) {
maths::CPeriodicityHypothesisTestsResult result{hypotheses.test()};
LOG_DEBUG(<< "result = " << result.print());
BOOST_REQUIRE_EQUAL(std::string("{ 'daily' 'weekly' }"), result.print());
BOOST_TEST_REQUIRE(
(result.print() == "{ 'daily' 'weekly' }" ||
result.print() == std::string("{ 'weekend daily' 'weekday daily' }")));
}
}
}
Expand Down Expand Up @@ -666,8 +667,9 @@ BOOST_AUTO_TEST_CASE(testWithOutliers) {
maths::CPeriodicityHypothesisTestsResult result{
maths::testForPeriods(config, startTime, bucketLength, values)};
LOG_DEBUG(<< "result = " << result.print());
BOOST_TEST_REQUIRE(result.print() ==
std::string("{ 'weekend daily' 'weekday daily' 'weekend weekly' }"));
BOOST_TEST_REQUIRE(
(result.print() == std::string("{ 'daily' 'weekly' }") ||
result.print() == std::string("{ 'weekend daily' 'weekday daily' 'weekend weekly' }")));
}
}
}
Expand Down Expand Up @@ -947,7 +949,7 @@ BOOST_AUTO_TEST_CASE(testWithPiecewiseLinearTrend) {
}

LOG_DEBUG(<< "Recall = " << TP / (TP + FN));
BOOST_TEST_REQUIRE(TP / (TP + FN) > 0.8);
BOOST_TEST_REQUIRE(TP / (TP + FN) > 0.9);
}

BOOST_AUTO_TEST_SUITE_END()