Adds recall@k metric to rank eval API (#52577)
This change adds the recall@k metric and refactors precision@k to match
the new metric.

Recall@k is an important metric for learning to rank (LTR) use cases.
Candidate generation, or first-phase, ranking functions are often optimized
for high recall, in order to generate as many relevant candidates in the
top k as possible for a second phase of ranking. Adding this metric allows
tuning that base query for LTR.

See: #51676
joshdevins committed Feb 27, 2020
1 parent 3e03928 commit 4ff5e03
Showing 11 changed files with 744 additions and 129 deletions.
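Before the individual diffs, a small self-contained sketch of what the new metric computes may help. This is illustrative only — it is not the `RecallAtK`/`PrecisionAtK` implementation from this commit, and the class and method names are invented for the example.

[source,java]
--------------------------------
import java.util.List;
import java.util.Set;

// Illustrative only: shows how recall@k differs from precision@k on one ranked result list.
public class RankMetricsSketch {

    // Precision@k: fraction of the top-k hits that are relevant.
    static double precisionAtK(List<String> rankedIds, Set<String> relevantIds, int k) {
        List<String> topK = rankedIds.subList(0, Math.min(k, rankedIds.size()));
        long relevantRetrieved = topK.stream().filter(relevantIds::contains).count();
        return topK.isEmpty() ? 0.0 : (double) relevantRetrieved / topK.size();
    }

    // Recall@k: fraction of all relevant documents that appear in the top-k hits.
    static double recallAtK(List<String> rankedIds, Set<String> relevantIds, int k) {
        List<String> topK = rankedIds.subList(0, Math.min(k, rankedIds.size()));
        long relevantRetrieved = topK.stream().filter(relevantIds::contains).count();
        return relevantIds.isEmpty() ? 0.0 : (double) relevantRetrieved / relevantIds.size();
    }

    public static void main(String[] args) {
        List<String> ranked = List.of("d1", "d2", "d3", "d4", "d5"); // what the query returned
        Set<String> relevant = Set.of("d2", "d5", "d9");             // d9 was never retrieved
        System.out.println(precisionAtK(ranked, relevant, 5));       // 2/5 = 0.4
        System.out.println(recallAtK(ranked, relevant, 5));          // 2/3 ≈ 0.667
    }
}
--------------------------------

A first-phase ranking function can score well on recall@k (few relevant candidates are missed) while scoring poorly on precision@k, which is the trade-off the commit message describes for LTR candidate generation.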
@@ -28,6 +28,7 @@
import org.elasticsearch.index.rankeval.ExpectedReciprocalRank;
import org.elasticsearch.index.rankeval.MeanReciprocalRank;
import org.elasticsearch.index.rankeval.PrecisionAtK;
import org.elasticsearch.index.rankeval.RecallAtK;
import org.elasticsearch.index.rankeval.RankEvalRequest;
import org.elasticsearch.index.rankeval.RankEvalResponse;
import org.elasticsearch.index.rankeval.RankEvalSpec;
@@ -130,9 +131,9 @@ private static List<RatedRequest> createTestEvaluationSpec() {
     */
    public void testMetrics() throws IOException {
        List<RatedRequest> specifications = createTestEvaluationSpec();
-       List<Supplier<EvaluationMetric>> metrics = Arrays.asList(PrecisionAtK::new, MeanReciprocalRank::new, DiscountedCumulativeGain::new,
-               () -> new ExpectedReciprocalRank(1));
-       double expectedScores[] = new double[] {0.4285714285714286, 0.75, 1.6408962261063627, 0.4407738095238095};
+       List<Supplier<EvaluationMetric>> metrics = Arrays.asList(PrecisionAtK::new, RecallAtK::new,
+               MeanReciprocalRank::new, DiscountedCumulativeGain::new, () -> new ExpectedReciprocalRank(1));
+       double expectedScores[] = new double[] {0.4285714285714286, 1.0, 0.75, 1.6408962261063627, 0.4407738095238095};
        int i = 0;
        for (Supplier<EvaluationMetric> metricSupplier : metrics) {
            RankEvalSpec spec = new RankEvalSpec(specifications, metricSupplier.get());
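To complement the integration-test change above, here is one way the new metric might be exercised through the Java high-level REST client. Treat it as a hedged sketch: the index name, document IDs and ratings are invented, and calls such as `client.rankEval(...)` and `response.getMetricScore()` are assumed from the 7.x high-level client rather than taken from this commit.

[source,java]
--------------------------------
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.rankeval.RankEvalRequest;
import org.elasticsearch.index.rankeval.RankEvalResponse;
import org.elasticsearch.index.rankeval.RankEvalSpec;
import org.elasticsearch.index.rankeval.RatedDocument;
import org.elasticsearch.index.rankeval.RatedRequest;
import org.elasticsearch.index.rankeval.RecallAtK;
import org.elasticsearch.search.builder.SearchSourceBuilder;

public class RecallAtKClientSketch {

    // Runs one rated query against a hypothetical "twitter" index and returns recall@k.
    static double evaluateRecall(RestHighLevelClient client) throws IOException {
        // Ratings at or above the relevant_rating_threshold (default 1) count as relevant.
        List<RatedDocument> ratedDocs = Arrays.asList(
            new RatedDocument("twitter", "1", 1),   // relevant
            new RatedDocument("twitter", "2", 0));  // not relevant

        SearchSourceBuilder query = new SearchSourceBuilder().query(QueryBuilders.matchAllQuery());
        RatedRequest ratedRequest = new RatedRequest("kennedy_query", ratedDocs, query);

        // RecallAtK() uses the documented defaults: k = 10, relevant_rating_threshold = 1.
        RankEvalSpec spec = new RankEvalSpec(Collections.singletonList(ratedRequest), new RecallAtK());
        RankEvalRequest request = new RankEvalRequest(spec, new String[] { "twitter" });

        RankEvalResponse response = client.rankEval(request, RequestOptions.DEFAULT);
        return response.getMetricScore();
    }
}
--------------------------------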
@@ -98,6 +98,7 @@
import org.elasticsearch.index.rankeval.MeanReciprocalRank;
import org.elasticsearch.index.rankeval.MetricDetail;
import org.elasticsearch.index.rankeval.PrecisionAtK;
import org.elasticsearch.index.rankeval.RecallAtK;
import org.elasticsearch.join.aggregations.ChildrenAggregationBuilder;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.SearchHits;
@@ -696,7 +697,7 @@ public void testDefaultNamedXContents() {

    public void testProvidedNamedXContents() {
        List<NamedXContentRegistry.Entry> namedXContents = RestHighLevelClient.getProvidedNamedXContents();
-       assertEquals(57, namedXContents.size());
+       assertEquals(59, namedXContents.size());
        Map<Class<?>, Integer> categories = new HashMap<>();
        List<String> names = new ArrayList<>();
        for (NamedXContentRegistry.Entry namedXContent : namedXContents) {
@@ -710,13 +711,15 @@ public void testProvidedNamedXContents() {
        assertEquals(Integer.valueOf(3), categories.get(Aggregation.class));
        assertTrue(names.contains(ChildrenAggregationBuilder.NAME));
        assertTrue(names.contains(MatrixStatsAggregationBuilder.NAME));
-       assertEquals(Integer.valueOf(4), categories.get(EvaluationMetric.class));
+       assertEquals(Integer.valueOf(5), categories.get(EvaluationMetric.class));
        assertTrue(names.contains(PrecisionAtK.NAME));
+       assertTrue(names.contains(RecallAtK.NAME));
        assertTrue(names.contains(DiscountedCumulativeGain.NAME));
        assertTrue(names.contains(MeanReciprocalRank.NAME));
        assertTrue(names.contains(ExpectedReciprocalRank.NAME));
-       assertEquals(Integer.valueOf(4), categories.get(MetricDetail.class));
+       assertEquals(Integer.valueOf(5), categories.get(MetricDetail.class));
        assertTrue(names.contains(PrecisionAtK.NAME));
+       assertTrue(names.contains(RecallAtK.NAME));
        assertTrue(names.contains(MeanReciprocalRank.NAME));
        assertTrue(names.contains(DiscountedCumulativeGain.NAME));
        assertTrue(names.contains(ExpectedReciprocalRank.NAME));
docs/reference/search/rank-eval.asciidoc (81 changes: 67 additions & 14 deletions)
@@ -203,20 +203,21 @@ will be used. The following metrics are supported:
[[k-precision]]
===== Precision at K (P@k)

-This metric measures the number of relevant results in the top k search results.
-It's a form of the well-known
-https://en.wikipedia.org/wiki/Information_retrieval#Precision[Precision] metric
-that only looks at the top k documents. It is the fraction of relevant documents
-in those first k results. A precision at 10 (P@10) value of 0.6 then means six
-out of the 10 top hits are relevant with respect to the user's information need.
-
-P@k works well as a simple evaluation metric that has the benefit of being easy
-to understand and explain. Documents in the collection need to be rated as either
-relevant or irrelevant with respect to the current query. P@k does not take
-into account the position of the relevant documents within the top k results,
-so a ranking of ten results that contains one relevant result in position 10 is
-equally as good as a ranking of ten results that contains one relevant result
-in position 1.
+This metric measures the proportion of relevant results in the top k search results.
+It's a form of the well-known
+https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Precision[Precision]
+metric that only looks at the top k documents. It is the fraction of relevant
+documents in those first k results. A precision at 10 (P@10) value of 0.6 then
+means 6 out of the 10 top hits are relevant with respect to the user's
+information need.
+
+P@k works well as a simple evaluation metric that has the benefit of being easy
+to understand and explain. Documents in the collection need to be rated as either
+relevant or irrelevant with respect to the current query. P@k is a set-based
+metric and does not take into account the position of the relevant documents
+within the top k results, so a ranking of ten results that contains one
+relevant result in position 10 is equally as good as a ranking of ten results
+that contains one relevant result in position 1.

[source,console]
--------------------------------
@@ -253,6 +254,58 @@ If set to 'true', unlabeled documents are ignored and neither count as relevant
|=======================================================================


[float]
[[k-recall]]
===== Recall at K (R@k)

This metric measures the fraction of the total set of relevant results that
appear in the top k search results. It's a form of the well-known
https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Recall[Recall]
metric: the number of relevant documents in those first k results divided by
the number of all possible relevant results. A recall at 10 (R@10) value of 0.5
then means that 4 out of 8 relevant documents, with respect to the user's
information need, were retrieved in the 10 top hits.

R@k works well as a simple evaluation metric that has the benefit of being easy
to understand and explain. Documents in the collection need to be rated as either
relevant or irrelevant with respect to the current query. R@k is a set-based
metric and does not take into account the position of the relevant documents
within the top k results, so a ranking of ten results that contains one
relevant result in position 10 is equally as good as a ranking of ten results
that contains one relevant result in position 1.

[source,console]
--------------------------------
GET /twitter/_rank_eval
{
"requests": [
{
"id": "JFK query",
"request": { "query": { "match_all": {}}},
"ratings": []
}],
"metric": {
"recall": {
"k" : 20,
"relevant_rating_threshold": 1
}
}
}
--------------------------------
// TEST[setup:twitter]

The `recall` metric takes the following optional parameters

[cols="<,<",options="header",]
|=======================================================================
|Parameter |Description
|`k` |sets the maximum number of documents retrieved per query. This value will act in place of the usual `size` parameter
in the query. Defaults to 10.
|`relevant_rating_threshold` |sets the rating threshold above which documents are considered to be
"relevant". Defaults to `1`.
|=======================================================================


[float]
===== Mean reciprocal rank

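To make the documented examples above concrete (P@10 = 0.6 and R@10 = 0.5), the two metrics can be written as the standard information-retrieval definitions below. This is background rather than text from the commit, and it glosses over the edge case where fewer than k documents are retrieved.

\[
\mathrm{P@k} = \frac{\lvert \text{relevant documents in the top } k \text{ results} \rvert}{k},
\qquad
\mathrm{R@k} = \frac{\lvert \text{relevant documents in the top } k \text{ results} \rvert}{\lvert \text{all relevant documents for the query} \rvert}
\]

So P@10 = 6/10 = 0.6 means six of the ten returned hits are relevant, while R@10 = 4/8 = 0.5 means the ten hits contain four of the eight documents rated relevant; both metrics are set-based and ignore positions within the top k.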
@@ -26,7 +26,7 @@
import java.io.IOException;

/**
- * Details about a specific {@link EvaluationMetric} that should be included in the resonse.
+ * Details about a specific {@link EvaluationMetric} that should be included in the response.
 */
public interface MetricDetail extends ToXContentObject, NamedWriteable {
