Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Search performance - better caching logic for queries on wildcard field #76035

Merged
merged 7 commits into from
Aug 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
Expand All @@ -27,32 +30,64 @@
import java.util.Objects;

/**
* Query that runs an Automaton across all binary doc values.
* Expensive to run so normally used in conjunction with more selective query clauses.
* Query that runs an Automaton across all binary doc values (but only for docs that also
* match a provided approximation query which is key to getting good performance).
*/
public class AutomatonQueryOnBinaryDv extends Query {
public class BinaryDvConfirmedAutomatonQuery extends Query {

private final String field;
private final String matchPattern;
private final ByteRunAutomaton bytesMatcher;
private final Query approxQuery;

public AutomatonQueryOnBinaryDv(String field, String matchPattern, Automaton automaton) {
public BinaryDvConfirmedAutomatonQuery(Query approximation, String field, String matchPattern, Automaton automaton) {
this.approxQuery = approximation;
this.field = field;
this.matchPattern = matchPattern;
bytesMatcher = new ByteRunAutomaton(automaton);
}

private BinaryDvConfirmedAutomatonQuery(Query approximation, String field, String matchPattern, ByteRunAutomaton bytesMatcher) {
this.approxQuery = approximation;
this.field = field;
this.matchPattern = matchPattern;
this.bytesMatcher = bytesMatcher;
}

@Override
public Query rewrite(IndexReader reader) throws IOException {
Query approxRewrite = approxQuery.rewrite(reader);
if (approxQuery != approxRewrite ) {
return new BinaryDvConfirmedAutomatonQuery(approxRewrite, field, matchPattern, bytesMatcher);
}
return this;
}

@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
final Weight approxWeight = approxQuery.createWeight(searcher, scoreMode, boost);

return new ConstantScoreWeight(this, boost) {

@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
ByteArrayDataInput badi = new ByteArrayDataInput();
final BinaryDocValues values = DocValues.getBinary(context.reader(), field);
TwoPhaseIterator twoPhase = new TwoPhaseIterator(values) {
Scorer approxScorer = approxWeight.scorer(context);
if (approxScorer == null) {
// No matches to be had
return null;
}
DocIdSetIterator approxDisi = approxScorer.iterator();
TwoPhaseIterator twoPhase = new TwoPhaseIterator(approxDisi) {
@Override
public boolean matches() throws IOException {
if (values.advanceExact(approxDisi.docID()) == false)
markharwood marked this conversation as resolved.
Show resolved Hide resolved
{
// Bug if we have an indexed value (i.e an approxQuery) but no doc value.
assert approxQuery instanceof MatchAllDocsQuery;
return false;
}
BytesRef arrayOfValues = values.binaryValue();
badi.reset(arrayOfValues.bytes);
badi.setPosition(arrayOfValues.offset);
Expand Down Expand Up @@ -93,14 +128,19 @@ public boolean equals(Object obj) {
if (sameClassAs(obj) == false) {
return false;
}
AutomatonQueryOnBinaryDv other = (AutomatonQueryOnBinaryDv) obj;
BinaryDvConfirmedAutomatonQuery other = (BinaryDvConfirmedAutomatonQuery) obj;
return Objects.equals(field, other.field) && Objects.equals(matchPattern, other.matchPattern)
&& Objects.equals(bytesMatcher, other.bytesMatcher);
&& Objects.equals(bytesMatcher, other.bytesMatcher)
&& Objects.equals(approxQuery, other.approxQuery);
}

@Override
public int hashCode() {
return Objects.hash(classHash(), field, matchPattern, bytesMatcher);
return Objects.hash(classHash(), field, matchPattern, bytesMatcher, approxQuery);
}

Query getApproximationQuery() {
return approxQuery;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -330,19 +330,15 @@ public Query wildcardQuery(String wildcardPattern, RewriteMethod method, boolean
Automaton automaton = caseInsensitive
? AutomatonQueries.toCaseInsensitiveWildcardAutomaton(new Term(name(), wildcardPattern), Integer.MAX_VALUE)
: WildcardQuery.toAutomaton(new Term(name(), wildcardPattern));
AutomatonQueryOnBinaryDv verifyingQuery = new AutomatonQueryOnBinaryDv(name(), wildcardPattern, automaton);
if (clauseCount > 0) {
// We can accelerate execution with the ngram query
BooleanQuery approxQuery = rewritten.build();
BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
verifyingBuilder.add(new BooleanClause(approxQuery, Occur.MUST));
verifyingBuilder.add(new BooleanClause(verifyingQuery, Occur.MUST));
return new ConstantScoreQuery(verifyingBuilder.build());
return new BinaryDvConfirmedAutomatonQuery(approxQuery, name(), wildcardPattern, automaton);
} else if (numWildcardChars == 0 || numWildcardStrings > 0) {
// We have no concrete characters and we're not a pure length query e.g. ???
return new DocValuesFieldExistsQuery(name());
}
return verifyingQuery;
return new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(), name(), wildcardPattern, automaton);

}

Expand All @@ -365,19 +361,15 @@ public Query regexpQuery(String value, int syntaxFlags, int matchFlags, int maxD
}
RegExp regex = new RegExp(value, syntaxFlags, matchFlags);
Automaton automaton = regex.toAutomaton(maxDeterminizedStates);
AutomatonQueryOnBinaryDv verifyingQuery = new AutomatonQueryOnBinaryDv(name(), value, automaton);

// MatchAllButRequireVerificationQuery is a special case meaning the regex is reduced to a single
// clause which we can't accelerate at all and needs verification. Example would be ".."
if (approxNgramQuery instanceof MatchAllButRequireVerificationQuery) {
return verifyingQuery;
return new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(), name(), value, automaton);
}

// We can accelerate execution with the ngram query
BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
verifyingBuilder.add(new BooleanClause(approxNgramQuery, Occur.MUST));
verifyingBuilder.add(new BooleanClause(verifyingQuery, Occur.MUST));
return verifyingBuilder.build();
return new BinaryDvConfirmedAutomatonQuery(approxNgramQuery, name(), value, automaton);
}

// Convert a regular expression to a simplified query consisting of BooleanQuery and TermQuery objects
Expand Down Expand Up @@ -745,16 +737,12 @@ public Query rangeQuery(
}
}
Automaton automaton = TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper);
AutomatonQueryOnBinaryDv slowQuery = new AutomatonQueryOnBinaryDv(name(), lower + "-" + upper, automaton);

if (accelerationQuery == null) {
return slowQuery;
return new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(),
name(), lower + "-" + upper, automaton);
}

BooleanQuery.Builder qBuilder = new BooleanQuery.Builder();
qBuilder.add(accelerationQuery, Occur.MUST);
qBuilder.add(slowQuery, Occur.MUST);
return qBuilder.build();
return new BinaryDvConfirmedAutomatonQuery(accelerationQuery, name(), lower + "-" + upper, automaton);
}

@Override
Expand All @@ -768,7 +756,6 @@ public Query fuzzyQuery(
) {
String searchTerm = BytesRefs.toString(value);
try {
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
//The approximation query can have a prefix and any number of ngrams.
BooleanQuery.Builder approxBuilder = new BooleanQuery.Builder();

Expand Down Expand Up @@ -824,9 +811,6 @@ public Query fuzzyQuery(
}

BooleanQuery ngramQ = approxBuilder.build();
if (ngramQ.clauses().size()>0) {
bqBuilder.add(ngramQ, Occur.MUST);
}

// Verification query
FuzzyQuery fq = new FuzzyQuery(
Expand All @@ -836,9 +820,12 @@ public Query fuzzyQuery(
maxExpansions,
transpositions
);
bqBuilder.add(new AutomatonQueryOnBinaryDv(name(), searchTerm, fq.getAutomata().automaton), Occur.MUST);
if (ngramQ.clauses().size() == 0) {
return new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(),
name(), searchTerm, fq.getAutomata().automaton);
}

return bqBuilder.build();
return new BinaryDvConfirmedAutomatonQuery(ngramQ, name(), searchTerm, fq.getAutomata().automaton);
} catch (IOException ioe) {
throw new ElasticsearchParseException("Error parsing wildcard field fuzzy string [" + searchTerm + "]");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
Expand Down Expand Up @@ -522,8 +521,9 @@ public void testRegexAcceleration() throws IOException, ParseException {
String matchAllButVerifyTests[]= { "..", "(a)?","(a|b){0,3}", "((foo)?|(foo|bar)?)", "@&~(abc.+)", "aaa.+&.+bbb"};
for (String regex : matchAllButVerifyTests) {
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 0, 20000, null, MOCK_CONTEXT);
BinaryDvConfirmedAutomatonQuery q = (BinaryDvConfirmedAutomatonQuery)wildcardFieldQuery;
assertTrue(regex +" was not a pure verify query " +formatQuery(wildcardFieldQuery),
wildcardFieldQuery instanceof AutomatonQueryOnBinaryDv);
q.getApproximationQuery() instanceof MatchAllDocsQuery);
}


Expand Down Expand Up @@ -575,17 +575,18 @@ public void testWildcardAcceleration() throws IOException, ParseException {
String expectedAccelerationQueryString = test[1].replaceAll("_", "" + WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_CONTEXT);
testExpectedAccelerationQuery(pattern, wildcardFieldQuery, expectedAccelerationQueryString);
assertTrue(unwrapAnyConstantScore(wildcardFieldQuery) instanceof BooleanQuery);
assertTrue(unwrapAnyConstantScore(wildcardFieldQuery) instanceof BinaryDvConfirmedAutomatonQuery);
}

// TODO All these expressions have no acceleration at all and could be improved
String slowPatterns[] = { "??" };
for (String pattern : slowPatterns) {
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_CONTEXT);
wildcardFieldQuery = unwrapAnyConstantScore(wildcardFieldQuery);
BinaryDvConfirmedAutomatonQuery q = (BinaryDvConfirmedAutomatonQuery) wildcardFieldQuery;
assertTrue(
pattern + " was not as slow as we assumed " + formatQuery(wildcardFieldQuery),
wildcardFieldQuery instanceof AutomatonQueryOnBinaryDv
q.getApproximationQuery() instanceof MatchAllDocsQuery
);
}

Expand All @@ -599,14 +600,17 @@ public void testQueryCachingEquality() throws IOException, ParseException {
new Term("field", pattern),
Integer.MAX_VALUE
);
AutomatonQueryOnBinaryDv csQ = new AutomatonQueryOnBinaryDv("field", pattern, caseSensitiveAutomaton);
AutomatonQueryOnBinaryDv ciQ = new AutomatonQueryOnBinaryDv("field", pattern, caseInSensitiveAutomaton);
BinaryDvConfirmedAutomatonQuery csQ = new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(),
"field", pattern, caseSensitiveAutomaton);
BinaryDvConfirmedAutomatonQuery ciQ = new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(),
"field", pattern, caseInSensitiveAutomaton);
assertNotEquals(csQ, ciQ);
assertNotEquals(csQ.hashCode(), ciQ.hashCode());

// Same query should be equal
Automaton caseSensitiveAutomaton2 = WildcardQuery.toAutomaton(new Term("field", pattern));
AutomatonQueryOnBinaryDv csQ2 = new AutomatonQueryOnBinaryDv("field", pattern, caseSensitiveAutomaton2);
BinaryDvConfirmedAutomatonQuery csQ2 = new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(),
"field", pattern, caseSensitiveAutomaton2);
assertEquals(csQ, csQ2);
assertEquals(csQ.hashCode(), csQ2.hashCode());
}
Expand Down Expand Up @@ -778,19 +782,8 @@ private Query unwrapAnyConstantScore(Query q) {
}

void testExpectedAccelerationQuery(String regex, Query combinedQuery, Query expectedAccelerationQuery) throws ParseException {
BooleanQuery cq = (BooleanQuery) unwrapAnyConstantScore(combinedQuery);
assert cq.clauses().size() == 2;
Query approximationQuery = null;
boolean verifyQueryFound = false;
for (BooleanClause booleanClause : cq.clauses()) {
Query q = booleanClause.getQuery();
if (q instanceof AutomatonQueryOnBinaryDv) {
verifyQueryFound = true;
} else {
approximationQuery = q;
}
}
assert verifyQueryFound;
BinaryDvConfirmedAutomatonQuery cq = (BinaryDvConfirmedAutomatonQuery) unwrapAnyConstantScore(combinedQuery);
Query approximationQuery = cq.getApproximationQuery();

String message = "regex: "+ regex +"\nactual query: " + formatQuery(approximationQuery) +
"\nexpected query: " + formatQuery(expectedAccelerationQuery) + "\n";
Expand Down