Skip to content

Commit

Permalink
Enable _terms_enum API for version fields (#93839)
Browse files Browse the repository at this point in the history
The _terms_enum API currently only supports the keyword, constant_keyword 
and flattened field type. This change adds support for the `version` field type
that sorts according to the semantic versioning definition.

Closes #83403
  • Loading branch information
cbuescher committed Feb 21, 2023
1 parent cf38fad commit edc7a61
Show file tree
Hide file tree
Showing 12 changed files with 349 additions and 77 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/93839.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 93839
summary: Enable _terms_enum on version fields
area: Search
type: enhancement
issues:
- 83403
6 changes: 3 additions & 3 deletions docs/reference/search/terms-enum.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@

The terms enum API can be used to discover terms in the index that match
a partial string. Supported field types are <<keyword-field-type,`keyword`>>,
<<constant-keyword-field-type,`constant_keyword`>> and
<<flattened,`flattened`>>. This is used for auto-complete:
<<constant-keyword-field-type,`constant_keyword`>>, <<flattened,`flattened`>>
and <<version,`version`>>. This is used for auto-complete:

[source,console]
--------------------------------------------------
Expand Down Expand Up @@ -109,4 +109,4 @@ query rewrites to `match_none`.
(Optional, string)
The string after which terms in the index should be returned. Allows for a form of
pagination if the last result from one request is passed as the `search_after`
parameter for a subsequent request.
parameter for a subsequent request.
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
Expand Down Expand Up @@ -561,26 +560,6 @@ public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutio
return terms.intersect(automaton, searchBytes);
}

// Initialises with a seek to a given term but excludes that term
// from any results. The problem it addresses is that termsEnum.seekCeil()
// would work but either leaves us positioned on the seek term (if it exists) or the
// term after (if the seek term doesn't exist). That complicates any subsequent
// iteration logic so this class simplifies the pagination use case.
static final class SearchAfterTermsEnum extends FilteredTermsEnum {
private final BytesRef afterRef;

SearchAfterTermsEnum(TermsEnum tenum, BytesRef termText) {
super(tenum);
afterRef = termText;
setInitialSeekTerm(termText);
}

@Override
protected AcceptStatus accept(BytesRef term) {
return term.equals(afterRef) ? AcceptStatus.NO : AcceptStatus.YES;
}
}

/**
* A simple terms implementation for SortedSetDocValues that only provides access to {@link TermsEnum} via
* {@link #iterator} and {@link #intersect(CompiledAutomaton, BytesRef)} methods.
Expand Down Expand Up @@ -898,6 +877,7 @@ public int ignoreAbove() {
/**
* @return true if field has been marked as a dimension field
*/
@Override
public boolean isDimension() {
return isDimension;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.index.mapper;

import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

/**
* This terms enumeration initializes with a seek to a given term but excludes that term
* from any results. The problem it addresses is that termsEnum.seekCeil()
* would work but either leaves us positioned on the seek term (if it exists) or the
* term after (if the seek term doesn't exist). That complicates any subsequent
* iteration logic so this class simplifies the pagination use case.
*/
/**
 * A {@link FilteredTermsEnum} that begins iteration strictly after a provided term.
 * <p>
 * A plain {@code termsEnum.seekCeil()} leaves the enumeration positioned either on
 * the seek term itself (when it exists) or on the term after it (when it does not),
 * which complicates the caller's subsequent iteration logic. This wrapper seeks to
 * the given term but excludes that exact term from the results, simplifying the
 * "search after" pagination use case.
 */
public final class SearchAfterTermsEnum extends FilteredTermsEnum {
    private final BytesRef excludedTerm;

    public SearchAfterTermsEnum(TermsEnum tenum, BytesRef termText) {
        super(tenum);
        excludedTerm = termText;
        setInitialSeekTerm(termText);
    }

    @Override
    protected AcceptStatus accept(BytesRef term) {
        if (term.equals(excludedTerm)) {
            // skip the seek term itself; everything after it is accepted
            return AcceptStatus.NO;
        }
        return AcceptStatus.YES;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,15 @@
import org.apache.lucene.util.PriorityQueue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;

/**
* Merges terms and stats from multiple TermEnum classes
* This does a merge sort, by term text.
* Adapted from Lucene's MultiTermsEnum and differs in that:
* 1) Only next(), term() and docFreq() methods are supported
* 2) Doc counts are longs not ints.
*
* Adapted from Lucene's MultiTermsEnum and differs in that
* only next() and term() are supported.
*/
public final class MultiShardTermsEnum {

Expand All @@ -38,45 +39,67 @@ public final class MultiShardTermsEnum {

private int numTop;
private BytesRef current;
private Function<Object, Object> termsDecoder;

private record ShardTermsEnum(TermsEnum termsEnum, Function<Object, Object> termsDecoder) {};

public static class Builder {

/** Sole constructor.
private final List<ShardTermsEnum> shardTermsEnums = new ArrayList<>();

void add(TermsEnum termsEnum, Function<Object, Object> termsDecoder) {
this.shardTermsEnums.add(new ShardTermsEnum(termsEnum, termsDecoder));
}

MultiShardTermsEnum build() throws IOException {
return new MultiShardTermsEnum(shardTermsEnums);
}

int size() {
return shardTermsEnums.size();
}
}

/**
* @param enums TermsEnums from shards which we should merge
* @throws IOException Errors accessing data
**/
public MultiShardTermsEnum(TermsEnum[] enums) throws IOException {
queue = new TermMergeQueue(enums.length);
top = new TermsEnumWithCurrent[enums.length];
/**
 * Creates the merged view over the given per-shard enumerations and primes the
 * merge queue with the first term of each non-empty shard enumeration.
 *
 * @param enums the shard-level enumerations (with their decoders) to merge
 * @throws IOException if advancing any underlying enumeration fails
 */
private MultiShardTermsEnum(List<ShardTermsEnum> enums) throws IOException {
    queue = new TermMergeQueue(enums.size());
    top = new TermsEnumWithCurrent[enums.size()];
    numTop = 0;
    queue.clear();
    for (ShardTermsEnum shardEnum : enums) {
        TermsEnum tenum = shardEnum.termsEnum();
        BytesRef firstTerm = tenum.next();
        if (firstTerm == null) {
            // this shard has no terms for the field; nothing to merge from it
            continue;
        }
        TermsEnumWithCurrent slot = new TermsEnumWithCurrent();
        slot.current = firstTerm;
        slot.termsEnum = tenum;
        slot.termsDecoder = shardEnum.termsDecoder();
        queue.add(slot);
    }
}

public BytesRef term() {
return current;
/**
 * Returns the current term decoded into its display form via the decoder of the
 * shard that produced it. Presumably only valid while positioned on a term
 * (i.e. after a {@code next()} that returned non-null), since {@code current}
 * and {@code termsDecoder} are set in {@code pullTop()} — confirm with callers.
 */
public String decodedTerm() {
return this.termsDecoder.apply(current).toString();
}

// Fills `top` with every enumeration positioned on the smallest term and caches
// that term plus its shard's decoder as the current merge state. Must only be
// called when no entries are currently pulled (hence the assert on numTop).
private void pullTop() {
assert numTop == 0;
numTop = queue.fillTop(top);
current = top[0].current;
termsDecoder = top[0].termsDecoder;
}

private void pushTop() throws IOException {
// call next() on each top, and reorder queue
for (int i = 0; i < numTop; i++) {
TermsEnumWithCurrent termsEnum = queue.top();
termsEnum.current = termsEnum.terms.next();
termsEnum.current = termsEnum.termsEnum.next();
if (termsEnum.current == null) {
queue.pop();
} else {
Expand All @@ -96,21 +119,13 @@ public BytesRef next() throws IOException {
} else {
current = null;
}

return current;
}

public long docFreq() throws IOException {
long sum = 0;
for (int i = 0; i < numTop; i++) {
sum += top[i].terms.docFreq();
}
return sum;
}

static final class TermsEnumWithCurrent {
TermsEnum terms;
public BytesRef current;
/**
 * Merge-queue slot: a shard-level enumeration together with the term it is
 * currently positioned on and the decoder that turns that term into a
 * display value.
 */
private static final class TermsEnumWithCurrent {
// decodes this enumeration's raw terms into display values
private Function<Object, Object> termsDecoder;
// the underlying shard-level enumeration
private TermsEnum termsEnum;
// term the enumeration is currently positioned on; null once exhausted
private BytesRef current;
}

private static final class TermMergeQueue extends PriorityQueue<TermsEnumWithCurrent> {
Expand All @@ -126,8 +141,10 @@ protected boolean lessThan(TermsEnumWithCurrent termsA, TermsEnumWithCurrent ter
return termsA.current.compareTo(termsB.current) < 0;
}

/** Add the {@link #top()} slice as well as all slices that are positioned
* on the same term to {@code tops} and return how many of them there are. */
/**
* Add the {@link #top()} slice as well as all slices that are positioned
* on the same term to {@code tops} and return how many of them there are.
*/
int fillTop(TermsEnumWithCurrent[] tops) {
final int size = size();
if (size == 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
package org.elasticsearch.xpack.core.termsenum.action;

import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.ActionListener;
Expand Down Expand Up @@ -329,9 +328,9 @@ protected NodeTermsEnumResponse dataNodeOperation(NodeTermsEnumRequest request,
long timeout_millis = request.timeout();
long scheduledEnd = request.nodeStartedTimeMillis() + timeout_millis;

ArrayList<TermsEnum> shardTermsEnums = new ArrayList<>();
ArrayList<Closeable> openedResources = new ArrayList<>();
try {
MultiShardTermsEnum.Builder teBuilder = new MultiShardTermsEnum.Builder();
for (ShardId shardId : request.shardIds()) {
// Check we haven't just arrived on a node and time is up already.
if (System.currentTimeMillis() > scheduledEnd) {
Expand Down Expand Up @@ -359,15 +358,15 @@ protected NodeTermsEnumResponse dataNodeOperation(NodeTermsEnumRequest request,
request.searchAfter()
);
if (terms != null) {
shardTermsEnums.add(terms);
teBuilder.add(terms, mappedFieldType::valueForDisplay);
}
}
}
if (shardTermsEnums.size() == 0) {
if (teBuilder.size() == 0) {
// No term enums available
return new NodeTermsEnumResponse(request.nodeId(), termsList, error, true);
}
MultiShardTermsEnum te = new MultiShardTermsEnum(shardTermsEnums.toArray(new TermsEnum[0]));
MultiShardTermsEnum te = teBuilder.build();

int shard_size = request.size();
// All the above prep might take a while - do a timer check now before we continue further.
Expand All @@ -387,8 +386,7 @@ protected NodeTermsEnumResponse dataNodeOperation(NodeTermsEnumRequest request,
}
termCount = 0;
}
BytesRef bytes = te.term();
termsList.add(bytes.utf8ToString());
termsList.add(te.decodedTerm());
if (termsList.size() >= shard_size) {
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
package org.elasticsearch.xpack.core.termsenum;
package org.elasticsearch.xpack.core.termsenum.action;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
Expand All @@ -18,6 +18,7 @@
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.analysis.MockAnalyzer;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
Expand All @@ -26,8 +27,6 @@
import org.elasticsearch.common.lucene.search.AutomatonQueries;
import org.elasticsearch.core.IOUtils;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xpack.core.termsenum.action.MultiShardTermsEnum;
import org.elasticsearch.xpack.core.termsenum.action.SimpleTermCountEnum;

import java.io.Closeable;
import java.util.ArrayList;
Expand Down Expand Up @@ -75,7 +74,7 @@ public void testRandomIndexFusion() throws Exception {
a = MinimizationOperations.minimize(a, Integer.MAX_VALUE);
CompiledAutomaton automaton = new CompiledAutomaton(a);

ArrayList<TermsEnum> termsEnums = new ArrayList<>();
MultiShardTermsEnum.Builder builder = new MultiShardTermsEnum.Builder();
for (DirectoryReader reader : readers) {
Terms terms = MultiTerms.getTerms(reader, fieldName);
TermsEnum te = automaton.getTermsEnum(terms);
Expand All @@ -86,13 +85,12 @@ public void testRandomIndexFusion() throws Exception {
while (te.next() != null) {
termCounts.add(te.term().utf8ToString());
}
SimpleTermCountEnum simpleEnum = new SimpleTermCountEnum(termCounts.toArray(new String[0]));
termsEnums.add(simpleEnum);
builder.add(new SimpleTermCountEnum(termCounts.toArray(new String[0])), o -> ((BytesRef) o).utf8ToString());
} else {
termsEnums.add(te);
builder.add(te, o -> ((BytesRef) o).utf8ToString());
}
}
MultiShardTermsEnum mte = new MultiShardTermsEnum(termsEnums.toArray(new TermsEnum[0]));
MultiShardTermsEnum mte = builder.build();
Set<String> expecteds = new HashSet<>();

for (String term : globalTermCounts) {
Expand All @@ -102,7 +100,7 @@ public void testRandomIndexFusion() throws Exception {
}

while (mte.next() != null) {
String teString = mte.term().utf8ToString();
String teString = mte.decodedTerm();
assertTrue(expecteds.contains(teString));
expecteds.remove(teString);
}
Expand Down

0 comments on commit edc7a61

Please sign in to comment.