
Don't use index_phrases on graph queries #44340

Merged · 1 commit · Jul 17, 2019
@@ -27,6 +27,9 @@ setup:
properties:
field:
type: text
phrase_field:
type: text
index_phrases: true

- do:
index:
@@ -204,3 +207,26 @@ setup:
- match: { hits.hits.2._id: "1" }
- match: { hits.hits.3._id: "8" }
- match: { hits.hits.4._id: "2" }

---
"index_phrases":

- do:
index:
index: test
id: 9
body:
phrase_field: "bar baz"
refresh: true

- do:
search:
rest_total_hits_as_int: true
body:
query:
match:
phrase_field:
query: bar baz
analyzer: lower_graph_syns
- match: { hits.total: 1 }

@@ -26,12 +26,15 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.shingle.FixedShingleFilter;
import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
@@ -44,8 +47,6 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.spans.FieldMaskingSpanQuery;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
@@ -680,7 +681,10 @@ public IntervalsSource intervals(String text, int maxGaps, boolean ordered,
@Override
public Query phraseQuery(TokenStream stream, int slop, boolean enablePosIncrements) throws IOException {
String field = name();
if (indexPhrases && slop == 0 && hasGaps(stream) == false) {
// we can't use the index_phrases shortcut with slop, if there are gaps in the stream,
// or if the incoming token stream is the output of a token graph due to
// https://issues.apache.org/jira/browse/LUCENE-8916
if (indexPhrases && slop == 0 && hasGaps(stream) == false && stream.hasAttribute(BytesTermAttribute.class) == false) {
Contributor:

Do we also want to add the stream.hasAttribute(BytesTermAttribute.class) condition for multiPhraseQuery below?

Contributor:

I don't know much about graph token filters, but I wonder if another way to check that index_phrases is not used with synonyms would be to modify hasGaps to also check that posIncAtt.getPositionIncrement() != 0. (A sketch of this idea follows the diff hunk below.)

Contributor Author:

This is a workaround for an implementation detail, namely that we represent the paths through a token graph using BytesTermAttribute instead of CharTermAttribute. Once LUCENE-8916 has been fixed this workaround will stop taking effect, so I think we probably should keep checking for the implementation detail itself rather than changing hasGaps.

Contributor Author:

We don't need this for multiPhraseQuery, because the results of calling getFiniteStrings() on a token graph are always simple token streams with no stacked tokens. The bug appears because MatchQuery.MatchQueryBuilder.analyzeGraphBoolean() iterates through all possible paths in the graph and sends each path back through createFieldQuery(). Because the paths are all simple, this ends up delegating to either FieldType#termQuery() or FieldType#phraseQuery(); multiPhraseQuery() is never encountered in this second pass, so there's no need to check for the attribute there. (A sketch of this path decomposition follows the diff hunk below.)

stream = new FixedShingleFilter(stream, 2);
field = field + FAST_PHRASE_SUFFIX;
}
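For illustration only, a minimal sketch of the alternative suggested in the review thread above: extend the position-increment scan that hasGaps performs so that it also rejects streams containing stacked tokens (position increment 0), which is how synonyms and other graph outputs appear. The method name hasGapsOrStackedTokens is hypothetical, and the body assumes hasGaps is the usual position-increment loop; this is not the change the PR makes.

```java
import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

final class PhraseShortcutChecks {

    // Hypothetical variant of hasGaps(): bail out of the index_phrases shortcut
    // both for gaps (posInc > 1, e.g. removed stopwords) and for stacked tokens
    // (posInc == 0, e.g. synonyms sharing a position).
    static boolean hasGapsOrStackedTokens(TokenStream stream) throws IOException {
        PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            int posInc = posIncAtt.getPositionIncrement();
            if (posInc > 1) {
                return true;  // a gap in the stream
            }
            if (posInc == 0) {
                return true;  // a stacked token at the same position
            }
        }
        return false;
    }
}
```

As the author notes in the thread above, this would permanently disable the shortcut for any stream containing stacked tokens, whereas the BytesTermAttribute check simply stops matching once LUCENE-8916 is fixed.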
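To make the explanation above concrete, here is a small, self-contained sketch of the path decomposition that analyzeGraphBoolean() relies on. It assumes Lucene 8.x and the test-framework CannedTokenStream, reuses the same canned tokens as the unit test further down, and the class name GraphPathsDemo is made up; it illustrates the mechanism and is not the actual MatchQuery code.

```java
import java.util.Iterator;

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;

public class GraphPathsDemo {
    public static void main(String[] args) throws Exception {
        // A token graph equivalent to the one in the test below:
        // "foo" spans two positions and is stacked on top of "bar baz".
        TokenStream graph = new CannedTokenStream(
            new Token("foo", 1, 0, 2, 2),
            new Token("bar", 0, 0, 2),
            new Token("baz", 1, 0, 2));

        // Flatten the graph into its finite paths, as analyzeGraphBoolean() does,
        // then inspect each path before it would be fed back through query building.
        Iterator<TokenStream> paths = new GraphTokenStreamFiniteStrings(graph).getFiniteStrings();
        while (paths.hasNext()) {
            TokenStream path = paths.next();  // a simple stream, no stacked tokens
            // In Lucene 8.x these per-path streams expose BytesTermAttribute rather
            // than CharTermAttribute (LUCENE-8916) -- the detail the new check keys on.
            System.out.println("BytesTermAttribute: " + path.hasAttribute(BytesTermAttribute.class)
                + ", CharTermAttribute: " + path.hasAttribute(CharTermAttribute.class));
        }
    }
}
```

Each path, being linear, ends up in FieldType#termQuery() or FieldType#phraseQuery(), which is why the attribute check added to phraseQuery() above is sufficient.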
@@ -693,6 +697,9 @@ public Query phraseQuery(TokenStream stream, int slop, boolean enablePosIncrements) throws IOException {

stream.reset();
while (stream.incrementToken()) {
if (termAtt.getBytesRef() == null) {
throw new IllegalStateException("Null term while building phrase query");
}
if (enablePosIncrements) {
position += posIncrAtt.getPositionIncrement();
}
@@ -19,7 +19,10 @@

package org.elasticsearch.index.mapper;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockSynonymAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.DocValuesType;
@@ -30,6 +33,8 @@
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
@@ -818,6 +823,28 @@ public void testFastPhraseMapping() throws IOException {
new Term("synfield._index_phrase", "motor dog")})
.build()));

// https://github.com/elastic/elasticsearch/issues/43976
CannedTokenStream cts = new CannedTokenStream(
new Token("foo", 1, 0, 2, 2),
new Token("bar", 0, 0, 2),
new Token("baz", 1, 0, 2)
);
Analyzer synonymAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(reader -> {}, cts);
}
};
matchQuery.setAnalyzer(synonymAnalyzer);
Query q7 = matchQuery.parse(MatchQuery.Type.BOOLEAN, "synfield", "foo");
assertThat(q7, is(new BooleanQuery.Builder().add(new BooleanQuery.Builder()
.add(new TermQuery(new Term("synfield", "foo")), BooleanClause.Occur.SHOULD)
.add(new PhraseQuery.Builder()
.add(new Term("synfield", "bar"))
.add(new Term("synfield", "baz"))
.build(), BooleanClause.Occur.SHOULD)
.build(), BooleanClause.Occur.SHOULD).build()));

ParsedDocument doc = mapper.parse(new SourceToParse("test", "type", "1", BytesReference
.bytes(XContentFactory.jsonBuilder()
.startObject()