[ML] Avoid very low p-values if the term is only a tiny fraction of the foreground set (#76764) (#76772)

Whilst testing the p_value scoring heuristic for significant terms introduced
in #75313, it became clear that we can assign arbitrarily low p-values, provided
the overall counts are high enough, to terms which constitute a very small
fraction of the foreground set. Even if the difference in their frequency between
the foreground and background sets is statistically significant, such terms do
not explain the majority of the foreground cases and so are not of real interest
(certainly not in the use cases we have for this aggregation).

We already have some mitigation for the cases where (1) the term frequency is
small on both the foreground and background sets, and (2) the term frequencies
are very similar. These mitigations offset the actual term counts by a fixed
small fraction of the background counts and make the foreground and background
frequencies more similar by a small relative amount, respectively. This change
simply applies the offsets to the term counts before making the frequencies more
similar. For term frequencies much less than the offset we therefore get equal
frequencies on the foreground and background sets, and the p-value tends to 1.
This retains the advantage of being a smooth correction to the p-value, so we get
no strange discontinuities in the vicinity of the small absolute and difference
thresholds for the frequency.
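
Concretely, the corrected order of operations looks like the following sketch.
EPS_FRACTION and the rounding details are illustrative assumptions rather than
the production constants; the real code derives the offsets from eps() applied
to the set sizes.

// Illustrative sketch only: shows why the reordering drives the p-value to 1
// for terms which are a tiny fraction of the foreground set.
static double[] smoothedFrequencies(double termFg, double allFg, double termBg, double allBg) {
    final double EPS_FRACTION = 1e-4;             // assumed, not the production value
    double epsFg = (long) (EPS_FRACTION * allFg); // round down to whole documents
    double epsBg = (long) (EPS_FRACTION * allBg);
    // 1. Offset the term counts and totals first (the reordering in this change).
    termFg += epsFg; allFg += epsFg;
    termBg += epsBg; allBg += epsBg;
    // 2. Then ignore ratio changes of less than 5% by nudging the background
    //    count towards the count matching the foreground frequency.
    termBg = (long) (Math.min(1.05 * termBg, termFg / allFg * allBg) + 0.5);
    // When the raw term counts are much smaller than the offsets, both
    // frequencies are dominated by eps, step 2 equalises them, and the
    // likelihood-ratio test yields a p-value of 1.
    return new double[] { termFg / allFg, termBg / allBg };
}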

Co-authored-by: Tom Veasey <tveasey@users.noreply.github.com>
Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
3 people committed Aug 23, 2021
1 parent 74f86c8 commit ac6c1b3
Showing 2 changed files with 31 additions and 30 deletions.
@@ -113,48 +113,50 @@ public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long
             return 0.0;
         }

+        // casting to `long` to round down to nearest whole number
+        double epsAllDocsInClass = (long)eps(allDocsInClass);
+        double epsAllDocsNotInClass = (long)eps(allDocsNotInClass);
+
+        docsContainTermInClass += epsAllDocsInClass;
+        docsContainTermNotInClass += epsAllDocsNotInClass;
+        allDocsInClass += epsAllDocsInClass;
+        allDocsNotInClass += epsAllDocsNotInClass;
+
         // Adjust counts to ignore ratio changes which are less than 5%
         // casting to `long` to round down to nearest whole number
         docsContainTermNotInClass = (long)(Math.min(
             1.05 * docsContainTermNotInClass,
             docsContainTermInClass / allDocsInClass * allDocsNotInClass
         ) + 0.5);

-        // casting to `long` to round down to nearest whole number
-        double epsAllDocsInClass = (long)eps(allDocsInClass);
-        double epsAllDocsNotInClass = (long)eps(allDocsNotInClass);
-
-        if ((allDocsInClass + epsAllDocsInClass) > Long.MAX_VALUE
-            || (docsContainTermInClass + epsAllDocsInClass) > Long.MAX_VALUE
-            || (allDocsNotInClass + epsAllDocsNotInClass) > Long.MAX_VALUE
-            || (docsContainTermNotInClass + epsAllDocsNotInClass) > Long.MAX_VALUE) {
+        if (allDocsInClass > Long.MAX_VALUE
+            || docsContainTermInClass > Long.MAX_VALUE
+            || allDocsNotInClass > Long.MAX_VALUE
+            || docsContainTermNotInClass > Long.MAX_VALUE) {
             throw new AggregationExecutionException(
                 "too many documents in background and foreground sets, further restrict sets for execution"
             );
         }

         double v1 = new LongBinomialDistribution(
-            (long)(allDocsInClass + epsAllDocsInClass),
-            (docsContainTermInClass + epsAllDocsInClass)/(allDocsInClass + epsAllDocsInClass)
-        ).logProbability((long)(docsContainTermInClass + epsAllDocsInClass));
+            (long)allDocsInClass, docsContainTermInClass / allDocsInClass
+        ).logProbability((long)docsContainTermInClass);

         double v2 = new LongBinomialDistribution(
-            (long)(allDocsNotInClass + epsAllDocsNotInClass),
-            (docsContainTermNotInClass + epsAllDocsNotInClass)/(allDocsNotInClass + epsAllDocsNotInClass)
-        ).logProbability((long)(docsContainTermNotInClass + epsAllDocsNotInClass));
+            (long)allDocsNotInClass, docsContainTermNotInClass / allDocsNotInClass
+        ).logProbability((long)docsContainTermNotInClass);

-        double p2 = (docsContainTermInClass + docsContainTermNotInClass + epsAllDocsNotInClass + epsAllDocsInClass)
-            / (allDocsInClass + allDocsNotInClass + epsAllDocsNotInClass + epsAllDocsInClass);
+        double p2 = (docsContainTermInClass + docsContainTermNotInClass) / (allDocsInClass + allDocsNotInClass);

-        double v3 = new LongBinomialDistribution((long)(allDocsInClass + epsAllDocsInClass), p2)
-            .logProbability((long)(docsContainTermInClass + epsAllDocsInClass));
+        double v3 = new LongBinomialDistribution((long)allDocsInClass, p2)
+            .logProbability((long)docsContainTermInClass);

-        double v4 = new LongBinomialDistribution((long)(allDocsNotInClass + epsAllDocsNotInClass), p2)
-            .logProbability((long)(docsContainTermNotInClass + epsAllDocsNotInClass));
+        double v4 = new LongBinomialDistribution((long)allDocsNotInClass, p2)
+            .logProbability((long)docsContainTermNotInClass);

         double logLikelihoodRatio = v1 + v2 - v3 - v4;
         double pValue = CHI_SQUARED_DISTRIBUTION.survivalFunction(2.0 * logLikelihoodRatio);
-        return -FastMath.log(FastMath.max(pValue, Double.MIN_NORMAL));
+        return FastMath.max(-FastMath.log(FastMath.max(pValue, Double.MIN_NORMAL)), 0.0);
     }

     private double eps(double value) {
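
The tail of the method (unchanged apart from the simplifications above) is a
two-binomial likelihood-ratio test, i.e. a G-test on the 2x2 term/class
contingency table: twice the log-likelihood ratio is referred to a chi-squared
distribution with one degree of freedom. Below is a minimal self-contained
sketch of the same computation using Apache Commons Math, with int counts for
illustration; the production LongBinomialDistribution exists precisely to
support long counts.

import org.apache.commons.math3.distribution.BinomialDistribution;
import org.apache.commons.math3.distribution.ChiSquaredDistribution;

// Sketch of the likelihood-ratio test underlying the score. The production
// code additionally applies the eps/5% smoothing first and clamps the final
// -log(p-value) score at zero.
final class GTestSketch {
    private static final ChiSquaredDistribution CHI2 = new ChiSquaredDistribution(1.0);

    static double pValue(int termFg, int allFg, int termBg, int allBg) {
        // Alternative hypothesis: each set keeps its own term frequency.
        double v1 = new BinomialDistribution(allFg, (double) termFg / allFg).logProbability(termFg);
        double v2 = new BinomialDistribution(allBg, (double) termBg / allBg).logProbability(termBg);
        // Null hypothesis: both sets share a common term frequency.
        double p2 = (double) (termFg + termBg) / (allFg + allBg);
        double v3 = new BinomialDistribution(allFg, p2).logProbability(termFg);
        double v4 = new BinomialDistribution(allBg, p2).logProbability(termBg);
        // 2 * log-likelihood ratio is asymptotically chi-squared, 1 degree of freedom.
        double statistic = 2.0 * (v1 + v2 - v3 - v4);
        return 1.0 - CHI2.cumulativeProbability(statistic); // survival function
    }
}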
@@ -22,7 +22,6 @@
 import static org.hamcrest.Matchers.allOf;
 import static org.hamcrest.Matchers.closeTo;
 import static org.hamcrest.Matchers.equalTo;
-import static org.hamcrest.Matchers.greaterThan;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.lessThanOrEqualTo;

@@ -110,31 +109,31 @@ public void testPValueScore() {
         );
         assertThat(
             FastMath.exp(-new PValueScore(false).getScore(10, 100, 10, 1000)),
-            closeTo(0.002988594884934073, eps)
+            closeTo(0.003972388976814195, eps)
         );
         assertThat(
             FastMath.exp(-new PValueScore(false).getScore(10, 100, 200, 1000)),
             closeTo(1.0, eps)
         );
         assertThat(
             FastMath.exp(-new PValueScore(false).getScore(20, 10000, 5, 10000)),
-            closeTo(0.6309430298306147, eps)
+            closeTo(1.0, eps)
         );
     }

     public void testSmallChanges() {
         assertThat(
             FastMath.exp(-new PValueScore(false).getScore(1, 4205, 0, 821496)),
-            closeTo(0.9572480202044421, eps)
+            closeTo(0.9999037287868853, eps)
         );
         // Same(ish) ratios
         assertThat(
             FastMath.exp(-new PValueScore(false).getScore(10, 4205, 195, 82149)),
-            closeTo(0.9893886454928338, eps)
+            closeTo(0.9995943820612134, eps)
         );
         assertThat(
             FastMath.exp(-new PValueScore(false).getScore(10, 4205, 1950, 821496)),
-            closeTo(0.9867689169546193, eps)
+            closeTo(0.9999942565428899, eps)
         );

         // 4% vs 0%
@@ -145,12 +144,12 @@
         // 4% vs 2%
         assertThat(
             FastMath.exp(-new PValueScore(false).getScore(168, 4205, 16429, 821496)),
-            closeTo(4.78464746423625e-06, eps)
+            closeTo(8.542608559219833e-5, eps)
         );
         // 4% vs 3.5%
         assertThat(
             FastMath.exp(-new PValueScore(false).getScore(168, 4205, 28752, 821496)),
-            closeTo(0.4728938449949742, eps)
+            closeTo(0.8833950526957098, eps)
         );
     }

@@ -186,7 +185,7 @@ public void testIncreasedSubsetIncreasedScore() {
         for (int j = 1; j < 11; j++) {
             double nextScore = getScore.apply(j*10L);
             assertThat(nextScore, greaterThanOrEqualTo(0.0));
-            assertThat(nextScore, greaterThan(priorScore));
+            assertThat(nextScore, greaterThanOrEqualTo(priorScore));
             priorScore = nextScore;
         }
     }
