Skip to content

Commit

Permalink
Enable _terms_enum API for version fields (#93839)
Browse files Browse the repository at this point in the history
The _terms_enum API currently only supports the keyword, constant_keyword 
and flattened field type. This change adds support for the `version` field type
that sorts according to the semantic versioning definition.

Closes #83403
  • Loading branch information
cbuescher committed Feb 21, 2023
1 parent cf38fad commit edc7a61
Show file tree
Hide file tree
Showing 12 changed files with 349 additions and 77 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/93839.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 93839
summary: Enable _terms_enum on version fields
area: Search
type: enhancement
issues:
- 83403
6 changes: 3 additions & 3 deletions docs/reference/search/terms-enum.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@

The terms enum API can be used to discover terms in the index that match
a partial string. Supported field types are <<keyword-field-type,`keyword`>>,
<<constant-keyword-field-type,`constant_keyword`>> and
<<flattened,`flattened`>>. This is used for auto-complete:
<<constant-keyword-field-type,`constant_keyword`>>, <<flattened,`flattened`>>
and <<version,`version`>>. This is used for auto-complete:

[source,console]
--------------------------------------------------
Expand Down Expand Up @@ -109,4 +109,4 @@ query rewrites to `match_none`.
(Optional, string)
The string after which terms in the index should be returned. Allows for a form of
pagination if the last result from one request is passed as the `search_after`
parameter for a subsequent request.
parameter for a subsequent request.
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
Expand Down Expand Up @@ -561,26 +560,6 @@ public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutio
return terms.intersect(automaton, searchBytes);
}

// Initialises with a seek to a given term but excludes that term
// from any results. The problem it addresses is that termsEnum.seekCeil()
// would work but either leaves us positioned on the seek term (if it exists) or the
// term after (if the seek term doesn't exist). That complicates any subsequent
// iteration logic so this class simplifies the pagination use case.
static final class SearchAfterTermsEnum extends FilteredTermsEnum {
private final BytesRef afterRef;

SearchAfterTermsEnum(TermsEnum tenum, BytesRef termText) {
super(tenum);
afterRef = termText;
setInitialSeekTerm(termText);
}

@Override
protected AcceptStatus accept(BytesRef term) {
return term.equals(afterRef) ? AcceptStatus.NO : AcceptStatus.YES;
}
}

/**
* A simple terms implementation for SortedSetDocValues that only provides access to {@link TermsEnum} via
* {@link #iterator} and {@link #intersect(CompiledAutomaton, BytesRef)} methods.
Expand Down Expand Up @@ -898,6 +877,7 @@ public int ignoreAbove() {
/**
* @return true if field has been marked as a dimension field
*/
@Override
public boolean isDimension() {
return isDimension;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.index.mapper;

import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

/**
* This terms enumeration initializes with a seek to a given term but excludes that term
* from any results. The problem it addresses is that termsEnum.seekCeil()
* would work but either leaves us positioned on the seek term (if it exists) or the
* term after (if the seek term doesn't exist). That complicates any subsequent
* iteration logic so this class simplifies the pagination use case.
*/
/**
 * A {@link FilteredTermsEnum} that begins iteration strictly after a provided term.
 * <p>
 * A plain {@code termsEnum.seekCeil()} leaves the enumeration positioned either on
 * the seek term itself (when it exists) or on the term after it (when it does not),
 * which complicates the caller's subsequent iteration logic. This wrapper seeks to
 * the given term but excludes that exact term from the results, simplifying the
 * "search after" pagination use case.
 */
public final class SearchAfterTermsEnum extends FilteredTermsEnum {
    private final BytesRef excludedTerm;

    public SearchAfterTermsEnum(TermsEnum tenum, BytesRef termText) {
        super(tenum);
        excludedTerm = termText;
        setInitialSeekTerm(termText);
    }

    @Override
    protected AcceptStatus accept(BytesRef term) {
        if (term.equals(excludedTerm)) {
            // skip the seek term itself; everything after it is accepted
            return AcceptStatus.NO;
        }
        return AcceptStatus.YES;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,15 @@
import org.apache.lucene.util.PriorityQueue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;

/**
* Merges terms and stats from multiple TermEnum classes
* This does a merge sort, by term text.
* Adapted from Lucene's MultiTermsEnum and differs in that:
* 1) Only next(), term() and docFreq() methods are supported
* 2) Doc counts are longs not ints.
*
* Adapted from Lucene's MultiTermsEnum and differs in that
* only next() and term() are supported.
*/
public final class MultiShardTermsEnum {

Expand All @@ -38,45 +39,67 @@ public final class MultiShardTermsEnum {

private int numTop;
private BytesRef current;
private Function<Object, Object> termsDecoder;

private record ShardTermsEnum(TermsEnum termsEnum, Function<Object, Object> termsDecoder) {};

public static class Builder {

/** Sole constructor.
private final List<ShardTermsEnum> shardTermsEnums = new ArrayList<>();

void add(TermsEnum termsEnum, Function<Object, Object> termsDecoder) {
this.shardTermsEnums.add(new ShardTermsEnum(termsEnum, termsDecoder));
}

MultiShardTermsEnum build() throws IOException {
return new MultiShardTermsEnum(shardTermsEnums);
}

int size() {
return shardTermsEnums.size();
}
}

/**
* @param enums TermsEnums from shards which we should merge
* @throws IOException Errors accessing data
**/
public MultiShardTermsEnum(TermsEnum[] enums) throws IOException {
queue = new TermMergeQueue(enums.length);
top = new TermsEnumWithCurrent[enums.length];
/**
 * Creates the merged view over the given per-shard enumerations and primes the
 * merge queue with the first term of each non-empty shard enumeration.
 *
 * @param enums the shard-level enumerations (with their decoders) to merge
 * @throws IOException if advancing any underlying enumeration fails
 */
private MultiShardTermsEnum(List<ShardTermsEnum> enums) throws IOException {
    queue = new TermMergeQueue(enums.size());
    top = new TermsEnumWithCurrent[enums.size()];
    numTop = 0;
    queue.clear();
    for (ShardTermsEnum shardEnum : enums) {
        TermsEnum tenum = shardEnum.termsEnum();
        BytesRef firstTerm = tenum.next();
        if (firstTerm == null) {
            // this shard has no terms for the field; nothing to merge from it
            continue;
        }
        TermsEnumWithCurrent slot = new TermsEnumWithCurrent();
        slot.current = firstTerm;
        slot.termsEnum = tenum;
        slot.termsDecoder = shardEnum.termsDecoder();
        queue.add(slot);
    }
}

public BytesRef term() {
return current;
/**
 * Returns the current term decoded into its display form via the decoder of the
 * shard that produced it. Presumably only valid while positioned on a term
 * (i.e. after a {@code next()} that returned non-null), since {@code current}
 * and {@code termsDecoder} are set in {@code pullTop()} — confirm with callers.
 */
public String decodedTerm() {
return this.termsDecoder.apply(current).toString();
}

// Fills `top` with every enumeration positioned on the smallest term and caches
// that term plus its shard's decoder as the current merge state. Must only be
// called when no entries are currently pulled (hence the assert on numTop).
private void pullTop() {
assert numTop == 0;
numTop = queue.fillTop(top);
current = top[0].current;
termsDecoder = top[0].termsDecoder;
}

private void pushTop() throws IOException {
// call next() on each top, and reorder queue
for (int i = 0; i < numTop; i++) {
TermsEnumWithCurrent termsEnum = queue.top();
termsEnum.current = termsEnum.terms.next();
termsEnum.current = termsEnum.termsEnum.next();
if (termsEnum.current == null) {
queue.pop();
} else {
Expand All @@ -96,21 +119,13 @@ public BytesRef next() throws IOException {
} else {
current = null;
}

return current;
}

public long docFreq() throws IOException {
long sum = 0;
for (int i = 0; i < numTop; i++) {
sum += top[i].terms.docFreq();
}
return sum;
}

static final class TermsEnumWithCurrent {
TermsEnum terms;
public BytesRef current;
/**
 * Merge-queue slot: a shard-level enumeration together with the term it is
 * currently positioned on and the decoder that turns that term into a
 * display value.
 */
private static final class TermsEnumWithCurrent {
// decodes this enumeration's raw terms into display values
private Function<Object, Object> termsDecoder;
// the underlying shard-level enumeration
private TermsEnum termsEnum;
// term the enumeration is currently positioned on; null once exhausted
private BytesRef current;
}

private static final class TermMergeQueue extends PriorityQueue<TermsEnumWithCurrent> {
Expand All @@ -126,8 +141,10 @@ protected boolean lessThan(TermsEnumWithCurrent termsA, TermsEnumWithCurrent ter
return termsA.current.compareTo(termsB.current) < 0;
}

/** Add the {@link #top()} slice as well as all slices that are positioned
* on the same term to {@code tops} and return how many of them there are. */
/**
* Add the {@link #top()} slice as well as all slices that are positioned
* on the same term to {@code tops} and return how many of them there are.
*/
int fillTop(TermsEnumWithCurrent[] tops) {
final int size = size();
if (size == 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
package org.elasticsearch.xpack.core.termsenum.action;

import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.ActionListener;
Expand Down Expand Up @@ -329,9 +328,9 @@ protected NodeTermsEnumResponse dataNodeOperation(NodeTermsEnumRequest request,
long timeout_millis = request.timeout();
long scheduledEnd = request.nodeStartedTimeMillis() + timeout_millis;

ArrayList<TermsEnum> shardTermsEnums = new ArrayList<>();
ArrayList<Closeable> openedResources = new ArrayList<>();
try {
MultiShardTermsEnum.Builder teBuilder = new MultiShardTermsEnum.Builder();
for (ShardId shardId : request.shardIds()) {
// Check we haven't just arrived on a node and time is up already.
if (System.currentTimeMillis() > scheduledEnd) {
Expand Down Expand Up @@ -359,15 +358,15 @@ protected NodeTermsEnumResponse dataNodeOperation(NodeTermsEnumRequest request,
request.searchAfter()
);
if (terms != null) {
shardTermsEnums.add(terms);
teBuilder.add(terms, mappedFieldType::valueForDisplay);
}
}
}
if (shardTermsEnums.size() == 0) {
if (teBuilder.size() == 0) {
// No term enums available
return new NodeTermsEnumResponse(request.nodeId(), termsList, error, true);
}
MultiShardTermsEnum te = new MultiShardTermsEnum(shardTermsEnums.toArray(new TermsEnum[0]));
MultiShardTermsEnum te = teBuilder.build();

int shard_size = request.size();
// All the above prep might take a while - do a timer check now before we continue further.
Expand All @@ -387,8 +386,7 @@ protected NodeTermsEnumResponse dataNodeOperation(NodeTermsEnumRequest request,
}
termCount = 0;
}
BytesRef bytes = te.term();
termsList.add(bytes.utf8ToString());
termsList.add(te.decodedTerm());
if (termsList.size() >= shard_size) {
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
package org.elasticsearch.xpack.core.termsenum;
package org.elasticsearch.xpack.core.termsenum.action;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
Expand All @@ -18,6 +18,7 @@
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
Expand All @@ -26,8 +27,6 @@
import org.elasticsearch.common.lucene.search.AutomatonQueries;
import org.elasticsearch.core.IOUtils;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xpack.core.termsenum.action.MultiShardTermsEnum;
import org.elasticsearch.xpack.core.termsenum.action.SimpleTermCountEnum;

import java.io.Closeable;
import java.util.ArrayList;
Expand Down Expand Up @@ -75,7 +74,7 @@ public void testRandomIndexFusion() throws Exception {
a = MinimizationOperations.minimize(a, Integer.MAX_VALUE);
CompiledAutomaton automaton = new CompiledAutomaton(a);

ArrayList<TermsEnum> termsEnums = new ArrayList<>();
MultiShardTermsEnum.Builder builder = new MultiShardTermsEnum.Builder();
for (DirectoryReader reader : readers) {
Terms terms = MultiTerms.getTerms(reader, fieldName);
TermsEnum te = automaton.getTermsEnum(terms);
Expand All @@ -86,13 +85,12 @@ public void testRandomIndexFusion() throws Exception {
while (te.next() != null) {
termCounts.add(te.term().utf8ToString());
}
SimpleTermCountEnum simpleEnum = new SimpleTermCountEnum(termCounts.toArray(new String[0]));
termsEnums.add(simpleEnum);
builder.add(new SimpleTermCountEnum(termCounts.toArray(new String[0])), o -> ((BytesRef) o).utf8ToString());
} else {
termsEnums.add(te);
builder.add(te, o -> ((BytesRef) o).utf8ToString());
}
}
MultiShardTermsEnum mte = new MultiShardTermsEnum(termsEnums.toArray(new TermsEnum[0]));
MultiShardTermsEnum mte = builder.build();
Set<String> expecteds = new HashSet<>();

for (String term : globalTermCounts) {
Expand All @@ -102,7 +100,7 @@ public void testRandomIndexFusion() throws Exception {
}

while (mte.next() != null) {
String teString = mte.term().utf8ToString();
String teString = mte.decodedTerm();
assertTrue(expecteds.contains(teString));
expecteds.remove(teString);
}
Expand Down

0 comments on commit edc7a61

Please sign in to comment.