Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correctly handle duplicates in unordered interval matching #49775

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ setup:
- '{"text" : "Baby its cold there outside"}'
- '{"index": {"_index": "test", "_id": "4"}}'
- '{"text" : "Outside it is cold and wet"}'
- '{"index": {"_index": "test", "_id": "5"}}'
- '{"text" : "To be or not to be, that is the question"}'
- '{"index": {"_index": "test", "_id": "6"}}'
- '{"text" : "Id like to be under the sea or not"}'

---
"Test ordered matching":
Expand Down Expand Up @@ -49,6 +53,22 @@ setup:
query: "cold outside"
- match: { hits.total.value: 3 }

---
"Test unordered matching with duplicates":
- skip:
version: "- 8.0.0"
reason: Added in v7.6
- do:
search:
index: test
body:
query:
intervals:
text:
match:
query: "to be or not to be"
- match: { hits.total.value: 1 }

---
"Test explicit unordered matching":
- do:
Expand Down Expand Up @@ -185,6 +205,27 @@ setup:
ordered: false
- match: { hits.total.value: 2 }

---
"Test unordered combination with duplicates":
- skip:
version: "- 8.0.0"
reason: Added in v7.6
- do:
search:
index: test
body:
query:
intervals:
text:
all_of:
intervals:
- match:
query: "to be"
- match:
query: "to be"
ordered: false
- match: { hits.total.value: 1 }

---
"Test block combination":
- do:
Expand Down
25 changes: 25 additions & 0 deletions server/src/main/java/org/apache/lucene/queries/XIntervals.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queries.intervals.FilteredIntervalsSource;
import org.apache.lucene.queries.intervals.IntervalIterator;
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
Expand All @@ -48,6 +49,8 @@
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.function.ToIntFunction;
import java.util.stream.Collectors;

/**
* Replacement for {@link Intervals#wildcard(BytesRef)} and {@link Intervals#prefix(BytesRef)}
Expand All @@ -67,6 +70,28 @@ public static IntervalsSource prefix(BytesRef prefix) {
return new MultiTermIntervalsSource(ca, 128, prefix.utf8ToString());
}

public static IntervalsSource maxWidth(IntervalsSource in, ToIntFunction<IntervalsSource> widthFunction) {
return Intervals.or(in.pullUpDisjunctions().stream()
.map(s -> new MaxWidth(s, widthFunction.applyAsInt(s)))
.collect(Collectors.toList()));
}

private static class MaxWidth extends FilteredIntervalsSource {

private final int maxWidth;

MaxWidth(IntervalsSource in, int maxWidth) {
super("MAXWIDTH/" + maxWidth, in);
this.maxWidth = maxWidth;
}

@Override
protected boolean accept(IntervalIterator it) {
return (it.end() - it.start()) + 1 <= maxWidth;
}

}

static class MultiTermIntervalsSource extends IntervalsSource {

private final CompiledAutomaton automaton;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,13 @@
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.queries.XIntervals;
import org.apache.lucene.queries.intervals.IntervalIterator;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;

Expand All @@ -40,7 +41,9 @@
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
* Constructs an IntervalsSource based on analyzed text
Expand Down Expand Up @@ -102,15 +105,15 @@ protected IntervalsSource analyzeText(CachingTokenFilter stream, int maxGaps, bo
return analyzeTerm(stream);
} else if (isGraph) {
// graph
return combineSources(analyzeGraph(stream), maxGaps, ordered);
return combineSources(analyzeGraph(stream), maxGaps, ordered, true);
} else {
// phrase
if (hasSynonyms) {
// phrase with single-term synonyms
return analyzeSynonyms(stream, maxGaps, ordered);
} else {
// simple phrase
return combineSources(analyzeTerms(stream), maxGaps, ordered);
return combineSources(analyzeTerms(stream), maxGaps, ordered, true);
}
}

Expand All @@ -123,13 +126,16 @@ protected IntervalsSource analyzeTerm(TokenStream ts) throws IOException {
return Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef()));
}

protected static IntervalsSource combineSources(List<IntervalsSource> sources, int maxGaps, boolean ordered) {
protected static IntervalsSource combineSources(List<IntervalsSource> sources, int maxGaps, boolean ordered, boolean fixedWidth) {
if (sources.size() == 0) {
return NO_INTERVALS;
}
if (sources.size() == 1) {
return sources.get(0);
}
if (ordered == false) {
sources = deduplicate(sources);
}
IntervalsSource[] sourcesArray = sources.toArray(new IntervalsSource[0]);
if (maxGaps == 0 && ordered) {
return Intervals.phrase(sourcesArray);
Expand All @@ -138,9 +144,36 @@ protected static IntervalsSource combineSources(List<IntervalsSource> sources, i
if (maxGaps == -1) {
return inner;
}
if (fixedWidth) {
return XIntervals.maxWidth(inner, s -> maxGaps + s.minExtent());
}
return Intervals.maxgaps(maxGaps, inner);
}

protected static List<IntervalsSource> deduplicate(List<IntervalsSource> sources) {
Map<IntervalsSource, Integer> counts = new LinkedHashMap<>(); // preserve order for testing
for (IntervalsSource source : sources) {
counts.compute(source, (k, v) -> v == null ? 1 : v + 1);
}
if (counts.size() == sources.size()) {
return sources;
}
sources = new ArrayList<>();
for (IntervalsSource source : counts.keySet()) {
int count = counts.get(source);
if (count == 1) {
sources.add(source);
} else {
IntervalsSource[] multiples = new IntervalsSource[count];
for (int i = 0; i < count; i++) {
multiples[i] = source;
}
sources.add(Intervals.ordered(multiples));
}
}
return sources;
}

protected List<IntervalsSource> analyzeTerms(TokenStream ts) throws IOException {
List<IntervalsSource> terms = new ArrayList<>();
TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
Expand Down Expand Up @@ -189,7 +222,7 @@ else if (synonyms.size() > 1) {
else {
terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
}
return combineSources(terms, maxGaps, ordered);
return combineSources(terms, maxGaps, ordered, true);
}

protected List<IntervalsSource> analyzeGraph(TokenStream source) throws IOException {
Expand All @@ -212,7 +245,7 @@ protected List<IntervalsSource> analyzeGraph(TokenStream source) throws IOExcept
Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
while (it.hasNext()) {
TokenStream ts = it.next();
IntervalsSource phrase = combineSources(analyzeTerms(ts), 0, true);
IntervalsSource phrase = combineSources(analyzeTerms(ts), 0, true, true);
if (paths.size() >= maxClauseCount) {
throw new BooleanQuery.TooManyClauses();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ public IntervalsSource getSource(QueryShardContext ctx, MappedFieldType fieldTyp
for (IntervalsSourceProvider provider : subSources) {
ss.add(provider.getSource(ctx, fieldType));
}
IntervalsSource source = IntervalBuilder.combineSources(ss, maxGaps, ordered);
IntervalsSource source = IntervalBuilder.combineSources(ss, maxGaps, ordered, false);
if (filter != null) {
return filter.filter(source, ctx, fieldType);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,21 @@ public void testGraphSynonymsWithGaps() throws IOException {

assertEquals(expected, source);

source = BUILDER.analyzeText(new CachingTokenFilter(ts), 2, false);
expected = Intervals.or(
Intervals.maxwidth(6, Intervals.unordered(
Intervals.term("term1"), Intervals.extend(Intervals.term("term2"), 1, 0), Intervals.term("term5")
)),
Intervals.maxwidth(9, Intervals.unordered(
Intervals.term("term1"),
Intervals.phrase(
Intervals.extend(Intervals.term("term3"), 1, 0),
Intervals.extend(Intervals.term("term4"), 2, 0)),
Intervals.term("term5")
))
);
assertEquals(expected, source);

}

public void testGraphTerminatesOnGap() throws IOException {
Expand All @@ -222,4 +237,26 @@ public void testGraphTerminatesOnGap() throws IOException {
assertEquals(expected, source);
}

public void testUnorderedWithRepeats() throws IOException {
IntervalsSource source = BUILDER.analyzeText("to be or not to be", -1, false);
IntervalsSource expected = Intervals.unordered(
Intervals.ordered(Intervals.term("to"), Intervals.term("to")),
Intervals.ordered(Intervals.term("be"), Intervals.term("be")),
Intervals.term("or"),
Intervals.term("not")
);
assertEquals(expected, source);
}

public void testUnorderedWithRepeatsAndMaxGaps() throws IOException {
IntervalsSource source = BUILDER.analyzeText("to be or not to be", 2, false);
IntervalsSource expected = Intervals.maxwidth(8, Intervals.unordered(
Intervals.ordered(Intervals.term("to"), Intervals.term("to")),
Intervals.ordered(Intervals.term("be"), Intervals.term("be")),
Intervals.term("or"),
Intervals.term("not")
));
assertEquals(expected, source);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ public void testMatchInterval() throws IOException {

builder = (IntervalQueryBuilder) parseQuery(json);
expected = new IntervalQuery(STRING_FIELD_NAME,
Intervals.maxgaps(40, Intervals.unordered(Intervals.term("hello"), Intervals.term("world"))));
XIntervals.maxWidth(Intervals.unordered(Intervals.term("hello"), Intervals.term("world")), s -> 42));
assertEquals(expected, builder.toQuery(createShardContext()));

json = "{ \"intervals\" : " +
Expand All @@ -217,7 +217,7 @@ public void testMatchInterval() throws IOException {

builder = (IntervalQueryBuilder) parseQuery(json);
expected = new IntervalQuery(STRING_FIELD_NAME,
Intervals.maxgaps(10, Intervals.ordered(Intervals.term("Hello"), Intervals.term("world"))));
XIntervals.maxWidth(Intervals.ordered(Intervals.term("Hello"), Intervals.term("world")), s -> 12));
assertEquals(expected, builder.toQuery(createShardContext()));

json = "{ \"intervals\" : " +
Expand All @@ -232,7 +232,7 @@ public void testMatchInterval() throws IOException {
builder = (IntervalQueryBuilder) parseQuery(json);
expected = new IntervalQuery(STRING_FIELD_NAME,
Intervals.fixField(MASKED_FIELD,
Intervals.maxgaps(10, Intervals.ordered(Intervals.term("Hello"), Intervals.term("world")))));
XIntervals.maxWidth(Intervals.ordered(Intervals.term("Hello"), Intervals.term("world")), s -> 12)));
assertEquals(expected, builder.toQuery(createShardContext()));

json = "{ \"intervals\" : " +
Expand All @@ -248,7 +248,7 @@ public void testMatchInterval() throws IOException {

builder = (IntervalQueryBuilder) parseQuery(json);
expected = new IntervalQuery(STRING_FIELD_NAME,
Intervals.containing(Intervals.maxgaps(10, Intervals.ordered(Intervals.term("Hello"), Intervals.term("world"))),
Intervals.containing(XIntervals.maxWidth(Intervals.ordered(Intervals.term("Hello"), Intervals.term("world")), s -> 12),
Intervals.term("blah")));
assertEquals(expected, builder.toQuery(createShardContext()));
}
Expand Down