Skip to content

Commit

Permalink
Don't use index_phrases on graph queries (#44340)
Browse files Browse the repository at this point in the history
Due to https://issues.apache.org/jira/browse/LUCENE-8916, when you
use a synonym filter that produces a token graph on a text field that has
the index_phrases option enabled, you can end up with null terms in a
phrase query, leading to unexpected exceptions further down the querying
chain. As a workaround, this commit disables the index_phrases
optimization for queries that produce token graphs.

Fixes #43976
  • Loading branch information
romseygeek committed Jul 17, 2019
1 parent ddd7401 commit b6a0f09
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ setup:
properties:
field:
type: text
phrase_field:
type: text
index_phrases: true

- do:
index:
Expand Down Expand Up @@ -204,3 +207,26 @@ setup:
- match: { hits.hits.2._id: "1" }
- match: { hits.hits.3._id: "8" }
- match: { hits.hits.4._id: "2" }

---
"index_phrases":
# Covers the scenario from https://github.com/elastic/elasticsearch/issues/43976:
# a match query run against phrase_field (mapped with index_phrases: true)
# while overriding the search analyzer with lower_graph_syns.
# NOTE(review): lower_graph_syns is defined outside this hunk; presumably it
# is a synonym analyzer that emits a token graph -- confirm in the setup section.

# Index one document into the index_phrases-enabled field and refresh so it is searchable.
- do:
index:
index: test
id: 9
body:
phrase_field: "bar baz"
refresh: true

# Run the match query with the overriding analyzer.
- do:
search:
rest_total_hits_as_int: true
body:
query:
match:
phrase_field:
query: bar baz
analyzer: lower_graph_syns
# The search must succeed and report exactly one hit in total.
- match: { hits.total: 1 }

Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,15 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.shingle.FixedShingleFilter;
import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
Expand All @@ -44,8 +47,6 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.spans.FieldMaskingSpanQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
Expand Down Expand Up @@ -688,7 +689,10 @@ public IntervalsSource intervals(String text, int maxGaps, boolean ordered,
@Override
public Query phraseQuery(TokenStream stream, int slop, boolean enablePosIncrements) throws IOException {
String field = name();
if (indexPhrases && slop == 0 && hasGaps(stream) == false) {
// we can't use the index_phrases shortcut with slop, if there are gaps in the stream,
// or if the incoming token stream is the output of a token graph due to
// https://issues.apache.org/jira/browse/LUCENE-8916
if (indexPhrases && slop == 0 && hasGaps(stream) == false && stream.hasAttribute(BytesTermAttribute.class) == false) {
stream = new FixedShingleFilter(stream, 2);
field = field + FAST_PHRASE_SUFFIX;
}
Expand All @@ -701,6 +705,9 @@ public Query phraseQuery(TokenStream stream, int slop, boolean enablePosIncremen

stream.reset();
while (stream.incrementToken()) {
if (termAtt.getBytesRef() == null) {
throw new IllegalStateException("Null term while building phrase query");
}
if (enablePosIncrements) {
position += posIncrAtt.getPositionIncrement();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@

package org.elasticsearch.index.mapper;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockSynonymAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.DocValuesType;
Expand All @@ -30,6 +33,8 @@
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
Expand Down Expand Up @@ -831,6 +836,28 @@ public void testFastPhraseMapping() throws IOException {
new Term("synfield._index_phrase", "motor dog")})
.build()));

// https://github.com/elastic/elasticsearch/issues/43976
CannedTokenStream cts = new CannedTokenStream(
new Token("foo", 1, 0, 2, 2),
new Token("bar", 0, 0, 2),
new Token("baz", 1, 0, 2)
);
Analyzer synonymAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(reader -> {}, cts);
}
};
matchQuery.setAnalyzer(synonymAnalyzer);
Query q7 = matchQuery.parse(MatchQuery.Type.BOOLEAN, "synfield", "foo");
assertThat(q7, is(new BooleanQuery.Builder().add(new BooleanQuery.Builder()
.add(new TermQuery(new Term("synfield", "foo")), BooleanClause.Occur.SHOULD)
.add(new PhraseQuery.Builder()
.add(new Term("synfield", "bar"))
.add(new Term("synfield", "baz"))
.build(), BooleanClause.Occur.SHOULD)
.build(), BooleanClause.Occur.SHOULD).build()));

ParsedDocument doc = mapper.parse(new SourceToParse("test", "type", "1", BytesReference
.bytes(XContentFactory.jsonBuilder()
.startObject()
Expand Down

0 comments on commit b6a0f09

Please sign in to comment.