Add "search_after" support to new termsEnum api (#72933)
Adds an optional parameter to the _terms_enum request designed to allow paging.
The last term from a previous result can be passed as the search_after parameter to a subsequent request, so that only terms after the given term (but still matching the provided string prefix) are returned.
Relates to #72910
markharwood committed May 14, 2021
1 parent a5a20ae commit ebb113a
Showing 12 changed files with 192 additions and 18 deletions.
7 changes: 7 additions & 0 deletions docs/reference/search/terms-enum.asciidoc
@@ -98,3 +98,10 @@ Defaults to false.
(Optional, <<query-dsl,query object>>) Allows filtering of an index shard if the provided
query rewrites to `match_none`.

[[terms-enum-search_after-param]]
`search_after`::
(Optional, string)
The string after which terms in the index should be returned. Allows for a form of
pagination if the last result from one request is passed as the `search_after`
parameter for a subsequent request.
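
For illustration, a minimal Java sketch of the paging flow this parameter enables, using the TermsEnumRequest setters added in this commit. The index name, field, and term values are invented, and the String-varargs constructor is an assumption (it is not shown in this diff):

import org.elasticsearch.xpack.core.termsenum.action.TermsEnumRequest;  // assumed package location

public class TermsEnumPagingSketch {
    public static void main(String[] args) {
        // First page: up to 10 terms in the hypothetical "tags" field starting with "kib".
        TermsEnumRequest firstPage = new TermsEnumRequest("my-index");   // assumed String... constructor
        firstPage.field("tags");
        firstPage.string("kib");
        firstPage.size(10);

        // Suppose the last term returned was "kibana". The follow-up request repeats the
        // same prefix and passes that term as search_after, so only terms sorting after
        // "kibana" (but still starting with "kib") are returned.
        TermsEnumRequest nextPage = new TermsEnumRequest("my-index");
        nextPage.field("tags");
        nextPage.string("kib");
        nextPage.size(10);
        nextPage.searchAfter("kibana");
    }
}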

@@ -13,18 +13,20 @@
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.lucene.Lucene;
@@ -259,7 +261,8 @@ public KeywordFieldType(String name, NamedAnalyzer analyzer) {
}

@Override
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext) throws IOException {
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext, String searchAfter)
throws IOException {
IndexReader reader = queryShardContext.searcher().getTopReaderContext().reader();

Terms terms = MultiTerms.getTerms(reader, name());
@@ -274,8 +277,38 @@ public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutio
a = MinimizationOperations.minimize(a, Integer.MAX_VALUE);

CompiledAutomaton automaton = new CompiledAutomaton(a);
return automaton.getTermsEnum(terms);
}

    BytesRef searchBytes = searchAfter == null ? null : new BytesRef(searchAfter);

    if (automaton.type == AUTOMATON_TYPE.ALL) {
        TermsEnum result = terms.iterator();
        if (searchAfter != null) {
            result = new SearchAfterTermsEnum(result, searchBytes);
        }
        return result;
    }
    return terms.intersect(automaton, searchBytes);
}

// Initialises with a seek to a given term but excludes that term
// from any results. The problem it addresses is that termsEnum.seekCeil()
// would work but either leaves us positioned on the seek term (if it exists) or the
// term after (if the seek term doesn't exist). That complicates any subsequent
// iteration logic so this class simplifies the pagination use case.
final class SearchAfterTermsEnum extends FilteredTermsEnum {
    private final BytesRef afterRef;

    SearchAfterTermsEnum(TermsEnum tenum, BytesRef termText) {
        super(tenum);
        afterRef = termText;
        setInitialSeekTerm(termText);
    }

    @Override
    protected AcceptStatus accept(BytesRef term) {
        return term.equals(afterRef) ? AcceptStatus.NO : AcceptStatus.YES;
    }
}
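
The intersect branch above leans on Lucene's Terms.intersect(automaton, startTerm), which is documented to return only terms greater than startTerm; the ALL-automaton branch gets the same exclusive behaviour from SearchAfterTermsEnum. Below is a standalone sketch, not part of this change, showing that seek-after behaviour against a throwaway index (the field name and values are made up):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;

public class SearchAfterIntersectSketch {
    public static void main(String[] args) throws Exception {
        ByteBuffersDirectory dir = new ByteBuffersDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
            for (String value : new String[] { "foo", "foobar", "foobaz", "football" }) {
                Document doc = new Document();
                doc.add(new StringField("tag", value, Field.Store.NO));
                writer.addDocument(doc);
            }
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            Terms terms = MultiTerms.getTerms(reader, "tag");
            // Prefix automaton for "foo", built the same way as in the keyword field code above.
            Automaton a = Operations.concatenate(Automata.makeString("foo"), Automata.makeAnyString());
            a = MinimizationOperations.minimize(a, Integer.MAX_VALUE);
            CompiledAutomaton automaton = new CompiledAutomaton(a);
            // Resume strictly after "foobar": only terms sorting after it are returned.
            TermsEnum te = terms.intersect(automaton, new BytesRef("foobar"));
            BytesRef term;
            while ((term = te.next()) != null) {
                System.out.println(term.utf8ToString()); // prints "foobaz" then "football"
            }
        }
    }
}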

@Override
public String typeName() {
@@ -440,10 +440,12 @@ public enum CollapseType {
* @param caseInsensitive if matches should be case insensitive
* @param string the partially complete word the user has typed (can be empty)
* @param queryShardContext the shard context
* @param searchAfter usually null. If supplied, the returned TermsEnum must be positioned after the provided term (used for pagination)
* @return null or an enumeration of matching terms and their doc frequencies
* @throws IOException Errors accessing data
*/
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext) throws IOException {
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext, String searchAfter)
throws IOException {
return null;
}
}
@@ -256,7 +256,8 @@ public Query termQueryCaseInsensitive(Object value, SearchExecutionContext conte
}

@Override
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext) throws IOException {
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext, String searchAfter)
throws IOException {
IndexReader reader = queryShardContext.searcher().getTopReaderContext().reader();
Terms terms = MultiTerms.getTerms(reader, name());
if (terms == null) {
@@ -274,8 +275,13 @@ public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutio
a = MinimizationOperations.minimize(a, Integer.MAX_VALUE);

CompiledAutomaton automaton = new CompiledAutomaton(a);
// Wrap result in a class that strips field names from discovered terms
return new TranslatingTermsEnum(automaton.getTermsEnum(terms));
    if (searchAfter != null) {
        BytesRef searchAfterWithFieldName = new BytesRef(key + FlattenedFieldParser.SEPARATOR + searchAfter);
        TermsEnum seekedEnum = terms.intersect(automaton, searchAfterWithFieldName);
        return new TranslatingTermsEnum(seekedEnum);
    } else {
        return new TranslatingTermsEnum(automaton.getTermsEnum(terms));
    }
}
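
For context, a keyed flattened field stores each term as the key and the value joined by an internal separator, which is why search_after has to be re-prefixed with the key before seeking and why the enum is wrapped to strip that prefix from every term handed back. A rough sketch of that mapping follows; the key, values, and the '\0' separator are assumptions for illustration only (the real constant lives in FlattenedFieldParser.SEPARATOR):

public class FlattenedTermLayoutSketch {
    // Assumption: a single delimiter character; see FlattenedFieldParser.SEPARATOR for the real one.
    private static final char SEPARATOR = '\0';

    public static void main(String[] args) {
        String key = "labels.env";      // hypothetical keyed flattened field
        String searchAfter = "prod";    // value supplied by the caller

        // What the shard-level TermsEnum is asked to seek past: "<key><SEPARATOR><value>".
        String seekTerm = key + SEPARATOR + searchAfter;

        // A term as stored in the index, and the value surfaced to the caller once the
        // key prefix is stripped again (the job of TranslatingTermsEnum).
        String stored = key + SEPARATOR + "staging";
        String surfaced = stored.substring(stored.indexOf(SEPARATOR) + 1);

        System.out.println(seekTerm.length());   // length of "<key><SEPARATOR>prod"
        System.out.println(surfaced);            // staging
    }
}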

@Override
@@ -27,6 +27,7 @@ public class NodeTermsEnumRequest extends TransportRequest implements IndicesReq

private String field;
private String string;
private String searchAfter;
private long taskStartedTimeMillis;
private long nodeStartedTimeMillis;
private boolean caseInsensitive;
@@ -41,6 +42,7 @@ public NodeTermsEnumRequest(StreamInput in) throws IOException {
super(in);
field = in.readString();
string = in.readString();
searchAfter = in.readOptionalString();
caseInsensitive = in.readBoolean();
size = in.readVInt();
timeout = in.readVLong();
@@ -57,15 +59,14 @@ public NodeTermsEnumRequest(StreamInput in) throws IOException {
public NodeTermsEnumRequest(final String nodeId, final Set<ShardId> shardIds, TermsEnumRequest request) {
this.field = request.field();
this.string = request.string();
this.searchAfter = request.searchAfter();
this.caseInsensitive = request.caseInsensitive();
this.size = request.size();
this.timeout = request.timeout().getMillis();
this.taskStartedTimeMillis = request.taskStartTimeMillis;
this.indexFilter = request.indexFilter();
this.nodeId = nodeId;
this.shardIds = shardIds;

// TODO serialize shard ids
}

public String field() {
@@ -76,6 +77,10 @@ public String string() {
return string;
}

public String searchAfter() {
return searchAfter;
}

public long taskStartedTimeMillis() {
return this.taskStartedTimeMillis;
}
@@ -119,6 +124,7 @@ public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
out.writeString(field);
out.writeString(string);
out.writeOptionalString(searchAfter);
out.writeBoolean(caseInsensitive);
out.writeVInt(size);
// Adjust the amount of permitted time the shard has remaining to gather terms.
@@ -20,7 +20,7 @@

/**
* A utility class for fields that need to support autocomplete via
* {@link MappedFieldType#getTerms(boolean, String, org.elasticsearch.index.query.SearchExecutionContext)}
* {@link MappedFieldType#getTerms(boolean, String, org.elasticsearch.index.query.SearchExecutionContext, String)}
* but can't return a raw Lucene TermsEnum.
*/
public class SimpleTermCountEnum extends TermsEnum {
@@ -39,6 +39,7 @@ public static TermsEnumRequest fromXContent(XContentParser parser, String... ind
static {
PARSER.declareString(TermsEnumRequest::field, new ParseField("field"));
PARSER.declareString(TermsEnumRequest::string, new ParseField("string"));
PARSER.declareString(TermsEnumRequest::searchAfter, new ParseField("search_after"));
PARSER.declareInt(TermsEnumRequest::size, new ParseField("size"));
PARSER.declareBoolean(TermsEnumRequest::caseInsensitive, new ParseField("case_insensitive"));
PARSER.declareField(TermsEnumRequest::timeout,
@@ -31,6 +31,7 @@ public class TermsEnumRequest extends BroadcastRequest<TermsEnumRequest> impleme

private String field;
private String string;
private String searchAfter;
private int size = DEFAULT_SIZE;
private boolean caseInsensitive;
long taskStartTimeMillis;
@@ -44,6 +45,7 @@ public TermsEnumRequest(StreamInput in) throws IOException {
super(in);
field = in.readString();
string = in.readString();
searchAfter = in.readOptionalString();
caseInsensitive = in.readBoolean();
size = in.readVInt();
indexFilter = in.readOptionalNamedWriteable(QueryBuilder.class);
@@ -103,6 +105,20 @@ public void string(String string) {
public String string() {
return string;
}

/**
* The string after which to find matching field values (enables paging from the results of a previous request)
*/
public String searchAfter() {
return searchAfter;
}

/**
* The string after which to find matching field values (enables paging from the results of a previous request)
*/
public void searchAfter(String searchAfter) {
this.searchAfter = searchAfter;
}

/**
* The number of terms to return
@@ -148,6 +164,7 @@ public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
out.writeString(field);
out.writeString(string);
out.writeOptionalString(searchAfter);
out.writeBoolean(caseInsensitive);
out.writeVInt(size);
out.writeOptionalNamedWriteable(indexFilter);
@@ -157,14 +174,18 @@ public void writeTo(StreamOutput out) throws IOException {
public String toString() {
return "[" + Arrays.toString(indices) + "] field[" + field + "], string[" + string + "] " + " size=" + size + " timeout="
+ timeout().getMillis() + " case_insensitive="
+ caseInsensitive + " indexFilter = "+ indexFilter;
+ caseInsensitive + " indexFilter = " + indexFilter +
" searchAfter[" + searchAfter + "]";
}

@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
builder.field("field", field);
builder.field("string", string);
if (searchAfter != null) {
builder.field("search_after", searchAfter);
}
builder.field("size", size);
builder.field("timeout", timeout().getMillis());
builder.field("case_insensitive", caseInsensitive);
@@ -298,13 +298,23 @@ protected NodeTermsEnumResponse dataNodeOperation(NodeTermsEnumRequest request,
);
final MappedFieldType mappedFieldType = indexShard.mapperService().fieldType(request.field());
if (mappedFieldType != null) {
TermsEnum terms = mappedFieldType.getTerms(request.caseInsensitive(), request.string(), queryShardContext);
TermsEnum terms = mappedFieldType.getTerms(
    request.caseInsensitive(),
    request.string(),
    queryShardContext,
    request.searchAfter()
);
if (terms != null) {
    shardTermsEnums.add(terms);
}
}
}
if (shardTermsEnums.size() == 0) {
// No term enums available
return new NodeTermsEnumResponse(request.nodeId(), termsList, error, true);
}
MultiShardTermsEnum te = new MultiShardTermsEnum(shardTermsEnums.toArray(new TermsEnum[0]));


int shard_size = request.size();
// All the above prep might take a while - do a timer check now before we continue further.
@@ -330,7 +340,7 @@ protected NodeTermsEnumResponse dataNodeOperation(NodeTermsEnumRequest request,
if (termsList.size() >= shard_size) {
break;
}
}
};

} catch (Exception e) {
error = ExceptionsHelper.stackTrace(e);
@@ -339,7 +349,7 @@ protected NodeTermsEnumResponse dataNodeOperation(NodeTermsEnumRequest request,
}
return new NodeTermsEnumResponse(request.nodeId(), termsList, error, true);
}

// TODO remove this so we can shift code to server module - write a separate Interceptor class to
// rewrite requests according to security rules
private boolean canAccess(
@@ -105,6 +105,7 @@ public void testRestTermEnumAction() throws Exception {
final String content = "{"
+ "\"field\":\"a\", "
+ "\"string\":\"foo\", "
+ "\"search_after\":\"football\", "
+ "\"index_filter\":{\"bool\":{\"must\":{\"term\":{\"user\":\"kimchy\"}}}}}";

final RestRequest request = createRestRequest(content);
@@ -148,13 +148,20 @@ public ValueFetcher valueFetcher(SearchExecutionContext context, String format)


@Override
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext) throws IOException {
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext, String searchAfter)
throws IOException {
    boolean matches = caseInsensitive ?
        value.toLowerCase(Locale.ROOT).startsWith(string.toLowerCase(Locale.ROOT)) :
        value.startsWith(string);
    if (matches == false) {
        return null;
    }
    if (searchAfter != null) {
        if (searchAfter.compareTo(value) >= 0) {
            // The constant value is at or before the searchAfter value so must be ignored
            return null;
        }
    }
    int docCount = queryShardContext.searcher().getIndexReader().maxDoc();
    return new SimpleTermCountEnum(new TermCount(value, docCount));
