Skip to content

Commit

Permalink
Fix some query extraction bugs. (#29283)
Browse files Browse the repository at this point in the history
While playing with the percolator I found two bugs:
 - Sometimes we set a min_should_match that is greater than the number of
   extractions. While this doesn't cause direct trouble, it does when the query
   is nested into a boolean query and the boolean query tries to compute the
   min_should_match for the entire query based on its own min_should_match and
   those of the sub queries. So I changed the code to throw an exception when
   min_should_match is greater than the number of extractions.
 - Boolean queries claim matches are verified when in fact they shouldn't. This
   is due to the fact that boolean queries assume that they are verified if all
   sub clauses are verified but things are more complex than that, eg.
   conjunctions that are nested in a disjunction or disjunctions that are nested
   in a conjunction can generally not be verified without running the query.
  • Loading branch information
jpountz committed Apr 3, 2018
1 parent dc1c169 commit 8cdd950
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 33 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ static Result analyze(Query query, Version indexVersion) {
}

private static BiFunction<Query, Version, Result> matchNoDocsQuery() {
return (query, version) -> new Result(true, Collections.emptySet(), 1);
return (query, version) -> new Result(true, Collections.emptySet(), 0);
}

private static BiFunction<Query, Version, Result> matchAllDocsQuery() {
Expand Down Expand Up @@ -179,36 +179,36 @@ private static BiFunction<Query, Version, Result> termInSetQuery() {
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
terms.add(new QueryExtraction(new Term(iterator.field(), term)));
}
return new Result(true, terms, 1);
return new Result(true, terms, Math.min(1, terms.size()));
};
}

private static BiFunction<Query, Version, Result> synonymQuery() {
return (query, version) -> {
Set<QueryExtraction> terms = ((SynonymQuery) query).getTerms().stream().map(QueryExtraction::new).collect(toSet());
return new Result(true, terms, 1);
return new Result(true, terms, Math.min(1, terms.size()));
};
}

private static BiFunction<Query, Version, Result> commonTermsQuery() {
return (query, version) -> {
Set<QueryExtraction> terms = ((CommonTermsQuery) query).getTerms().stream().map(QueryExtraction::new).collect(toSet());
return new Result(false, terms, 1);
return new Result(false, terms, Math.min(1, terms.size()));
};
}

private static BiFunction<Query, Version, Result> blendedTermQuery() {
return (query, version) -> {
Set<QueryExtraction> terms = ((BlendedTermQuery) query).getTerms().stream().map(QueryExtraction::new).collect(toSet());
return new Result(true, terms, 1);
return new Result(true, terms, Math.min(1, terms.size()));
};
}

private static BiFunction<Query, Version, Result> phraseQuery() {
return (query, version) -> {
Term[] terms = ((PhraseQuery) query).getTerms();
if (terms.length == 0) {
return new Result(true, Collections.emptySet(), 1);
return new Result(true, Collections.emptySet(), 0);
}

if (version.onOrAfter(Version.V_6_1_0)) {
Expand All @@ -232,7 +232,7 @@ private static BiFunction<Query, Version, Result> multiPhraseQuery() {
return (query, version) -> {
Term[][] terms = ((MultiPhraseQuery) query).getTermArrays();
if (terms.length == 0) {
return new Result(true, Collections.emptySet(), 1);
return new Result(true, Collections.emptySet(), 0);
}

if (version.onOrAfter(Version.V_6_1_0)) {
Expand Down Expand Up @@ -297,7 +297,7 @@ private static BiFunction<Query, Version, Result> spanOrQuery() {
for (SpanQuery clause : spanOrQuery.getClauses()) {
terms.addAll(analyze(clause, version).extractions);
}
return new Result(false, terms, 1);
return new Result(false, terms, Math.min(1, terms.size()));
};
}

Expand Down Expand Up @@ -334,6 +334,9 @@ private static BiFunction<Query, Version, Result> booleanQuery() {
numOptionalClauses++;
}
}
if (minimumShouldMatch > numOptionalClauses) {
return new Result(false, Collections.emptySet(), 0);
}
if (numRequiredClauses > 0) {
if (version.onOrAfter(Version.V_6_1_0)) {
UnsupportedQueryException uqe = null;
Expand All @@ -345,7 +348,12 @@ private static BiFunction<Query, Version, Result> booleanQuery() {
// since they are completely optional.

try {
results.add(analyze(clause.getQuery(), version));
Result subResult = analyze(clause.getQuery(), version);
if (subResult.matchAllDocs == false && subResult.extractions.isEmpty()) {
// doesn't match anything
return subResult;
}
results.add(subResult);
} catch (UnsupportedQueryException e) {
uqe = e;
}
Expand Down Expand Up @@ -400,7 +408,11 @@ private static BiFunction<Query, Version, Result> booleanQuery() {
}
msm += resultMsm;

verified &= result.verified;
if (result.verified == false
// If some inner extractions are optional, the result can't be verified
|| result.minimumShouldMatch < result.extractions.size()) {
verified = false;
}
matchAllDocs &= result.matchAllDocs;
extractions.addAll(result.extractions);
}
Expand Down Expand Up @@ -492,7 +504,7 @@ private static BiFunction<Query, Version, Result> pointRangeQuery() {
// Need to check whether upper is not smaller than lower, otherwise NumericUtils.subtract(...) fails IAE
// If upper is really smaller than lower then we deal with like MatchNoDocsQuery. (verified and no extractions)
if (new BytesRef(lowerPoint).compareTo(new BytesRef(upperPoint)) > 0) {
return new Result(true, Collections.emptySet(), 1);
return new Result(true, Collections.emptySet(), 0);
}

byte[] interval = new byte[16];
Expand Down Expand Up @@ -537,7 +549,15 @@ private static Result handleDisjunction(List<Query> disjunctions, int requiredSh
for (int i = 0; i < disjunctions.size(); i++) {
Query disjunct = disjunctions.get(i);
Result subResult = analyze(disjunct, version);
verified &= subResult.verified;
if (subResult.verified == false
// one of the sub queries requires more than one term to match, we can't
// verify it with a single top-level min_should_match
|| subResult.minimumShouldMatch > 1
// One of the inner clauses has multiple extractions, we won't be able to
// verify it with a single top-level min_should_match
|| (subResult.extractions.size() > 1 && requiredShouldClauses > 1)) {
verified = false;
}
if (subResult.matchAllDocs) {
numMatchAllClauses++;
}
Expand Down Expand Up @@ -683,6 +703,10 @@ static class Result {
final boolean matchAllDocs;

Result(boolean verified, Set<QueryExtraction> extractions, int minimumShouldMatch) {
if (minimumShouldMatch > extractions.size()) {
throw new IllegalArgumentException("minimumShouldMatch can't be greater than the number of extractions: "
+ minimumShouldMatch + " > " + extractions.size());
}
this.extractions = extractions;
this.verified = verified;
this.minimumShouldMatch = minimumShouldMatch;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,12 +210,13 @@ public void testDuel() throws Exception {
new BytesRef(randomFrom(stringContent.get(field1)))));
queryFunctions.add(() -> new TermInSetQuery(field2, new BytesRef(randomFrom(stringContent.get(field1))),
new BytesRef(randomFrom(stringContent.get(field1)))));
int numRandomBoolQueries = randomIntBetween(16, 32);
// many iterations with boolean queries, which are the most complex queries to deal with when nested
int numRandomBoolQueries = 1000;
for (int i = 0; i < numRandomBoolQueries; i++) {
queryFunctions.add(() -> createRandomBooleanQuery(1, stringFields, stringContent, intFieldType, intValues));
}
queryFunctions.add(() -> {
int numClauses = randomIntBetween(1, 16);
int numClauses = randomIntBetween(1, 1 << randomIntBetween(2, 4));
List<Query> clauses = new ArrayList<>();
for (int i = 0; i < numClauses; i++) {
String field = randomFrom(stringFields);
Expand Down Expand Up @@ -266,7 +267,7 @@ public void testDuel() throws Exception {
private BooleanQuery createRandomBooleanQuery(int depth, List<String> fields, Map<String, List<String>> content,
MappedFieldType intFieldType, List<Integer> intValues) {
BooleanQuery.Builder builder = new BooleanQuery.Builder();
int numClauses = randomIntBetween(1, 16);
int numClauses = randomIntBetween(1, 1 << randomIntBetween(2, 4)); // use low numbers of clauses more often
int numShouldClauses = 0;
boolean onlyShouldClauses = rarely();
for (int i = 0; i < numClauses; i++) {
Expand Down Expand Up @@ -313,7 +314,7 @@ private BooleanQuery createRandomBooleanQuery(int depth, List<String> fields, Ma
numShouldClauses++;
}
}
builder.setMinimumNumberShouldMatch(numShouldClauses);
builder.setMinimumNumberShouldMatch(randomIntBetween(0, numShouldClauses));
return builder.build();
}

Expand Down
Loading

0 comments on commit 8cdd950

Please sign in to comment.