Fix highlighting of overlapping terms in the unified highlighter (#47227)

The passage formatter that the unified highlighter uses doesn't handle terms with overlapping offsets.
For tokenizers that produce multiple segmentations of the same term (edge n-grams, for instance), the formatter
should select the largest span so that the term is highlighted only once. This change implements that logic.
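
To make the "select the largest span" rule concrete, here is a minimal standalone sketch of that selection, assuming match offsets arrive sorted by start offset (as the highlighter supplies them per passage). The class name, method, and hard-coded offsets below are illustrative only and are not part of the change; the actual fix is in the passage formatter diff further down.

// Illustrative sketch only: merges overlapping [start, end) matches into one highlighted span.
public class LargestSpanSketch {

    static String highlight(String content, int[] starts, int[] ends, String preTag, String postTag) {
        StringBuilder sb = new StringBuilder();
        int pos = 0;
        for (int i = 0; i < starts.length; i++) {
            int start = starts[i];
            int end = ends[i];
            // Extend the span while the next match begins before the current one ends,
            // so overlapping segmentations of the same text are tagged only once.
            while (i + 1 < starts.length && starts[i + 1] < end) {
                end = Math.max(end, ends[++i]);
            }
            sb.append(content, pos, start);                        // text before the match
            sb.append(preTag).append(content, start, end).append(postTag);
            pos = end;
        }
        sb.append(content, pos, content.length());                 // trailing text
        return sb.toString();
    }

    public static void main(String[] args) {
        String text = "quick brown fox jump lazy dog";
        // Hypothetical 2-gram matches: "brown fox" at [6,15) and "fox jump" at [12,20) overlap.
        System.out.println(highlight(text, new int[]{6, 12}, new int[]{15, 20}, "<em>", "</em>"));
        // prints: quick <em>brown fox jump</em> lazy dog
    }
}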
jimczi committed Oct 2, 2019
1 parent 4f722f0 commit c340814
Showing 3 changed files with 50 additions and 12 deletions.
@@ -165,7 +165,7 @@ setup:
   - match: { hits.hits.0._source.a_field: "quick brown fox jump lazy dog" }
   - match: { hits.hits.0._source.text_field: "quick brown fox jump lazy dog" }
   - match: { hits.hits.0.highlight.a_field: ["quick <em>brown</em> <em>fox</em> <em>jump</em> lazy dog"] }
-  - match: { hits.hits.0.highlight.a_field\._2gram: ["quick <em>brown fox</em><em> jump</em> lazy dog"] }
+  - match: { hits.hits.0.highlight.a_field\._2gram: ["quick <em>brown fox jump</em> lazy dog"] }
   - match: { hits.hits.0.highlight.a_field\._3gram: ["quick <em>brown fox jump</em> lazy dog"] }
   - match: { hits.hits.0.highlight.a_field\._4gram: null }

@@ -197,6 +197,6 @@ setup:
   - match: { hits.hits.0._source.a_field: "quick brown fox jump lazy dog" }
   - match: { hits.hits.0._source.text_field: "quick brown fox jump lazy dog" }
   - match: { hits.hits.0.highlight.a_field: ["quick <em>brown</em> <em>fox</em> <em>jump</em> <em>lazy</em> dog"] }
-  - match: { hits.hits.0.highlight.a_field\._2gram: ["quick <em>brown fox</em><em> jump</em><em> lazy</em> dog"] }
-  - match: { hits.hits.0.highlight.a_field\._3gram: ["quick <em>brown fox jump</em><em> lazy</em> dog"] }
+  - match: { hits.hits.0.highlight.a_field\._2gram: ["quick <em>brown fox jump lazy</em> dog"] }
+  - match: { hits.hits.0.highlight.a_field\._3gram: ["quick <em>brown fox jump lazy</em> dog"] }
   - match: { hits.hits.0.highlight.a_field\._4gram: ["quick <em>brown fox jump lazy</em> dog"] }
@@ -49,17 +49,23 @@ public Snippet[] format(Passage[] passages, String content) {
             pos = passage.getStartOffset();
             for (int i = 0; i < passage.getNumMatches(); i++) {
                 int start = passage.getMatchStarts()[i];
+                assert start >= pos && start < passage.getEndOffset();
+                // append content before this start
+                append(sb, content, pos, start);
+
                 int end = passage.getMatchEnds()[i];
-                // its possible to have overlapping terms
-                if (start > pos) {
-                    append(sb, content, pos, start);
-                }
-                if (end > pos) {
-                    sb.append(preTag);
-                    append(sb, content, Math.max(pos, start), end);
-                    sb.append(postTag);
-                    pos = end;
+                assert end > start;
+                // Look ahead to expand 'end' past all overlapping:
+                while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i + 1] < end) {
+                    end = passage.getMatchEnds()[++i];
                 }
+                end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage
+
+                sb.append(preTag);
+                append(sb, content, start, end);
+                sb.append(postTag);
+
+                pos = end;
             }
             // its possible a "term" from the analyzer could span a sentence boundary.
             append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
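
Two details of the new loop are easy to miss: the ++i inside the look-ahead consumes the matches it swallows, so they are not emitted a second time, and Math.min(end, passage.getEndOffset()) clamps a match that runs past the passage. The trace below replays the same logic on made-up offsets and a made-up passage boundary; it uses no Lucene classes and is an illustration, not the formatter code.

// Standalone trace of the look-ahead loop above, on made-up offsets (no Lucene classes).
public class LookAheadTrace {
    public static void main(String[] args) {
        String content = "quick brown fox jump lazy dog";
        int passageEnd = 20;                      // pretend the passage stops after "jump"
        int[] matchStarts = {6, 12, 16};          // "brown fox", "fox jump", "jump lazy"
        int[] matchEnds = {15, 20, 25};
        StringBuilder sb = new StringBuilder();
        int pos = 0;
        for (int i = 0; i < matchStarts.length; i++) {
            int start = matchStarts[i];
            sb.append(content, pos, start);       // content before this match
            int end = matchEnds[i];
            // look ahead: swallow every later match that starts inside the current span,
            // advancing i so those matches are not emitted a second time
            while (i + 1 < matchStarts.length && matchStarts[i + 1] < end) {
                end = matchEnds[++i];
            }
            end = Math.min(end, passageEnd);      // clamp a match that straddles the passage end
            sb.append("<em>").append(content, start, end).append("</em>");
            pos = end;
        }
        sb.append(content, pos, Math.max(pos, passageEnd));
        System.out.println(sb);                   // quick <em>brown fox jump</em>
    }
}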
@@ -20,6 +20,8 @@
 package org.apache.lucene.search.uhighlight;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -33,6 +35,7 @@
 import org.apache.lucene.queries.CommonTermsQuery;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.PhraseQuery;
@@ -240,4 +243,33 @@ public void testGroupSentences() throws Exception {
             BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20), 0, outputs);
     }
 
+    public void testOverlappingTerms() throws Exception {
+        final String[] inputs = {
+            "bro",
+            "brown",
+            "brownie",
+            "browser"
+        };
+        final String[] outputs = {
+            "<b>bro</b>",
+            "<b>brown</b>",
+            "<b>browni</b>e",
+            "<b>browser</b>"
+        };
+        BooleanQuery query = new BooleanQuery.Builder()
+            .add(new FuzzyQuery(new Term("text", "brow")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "b")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "br")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "bro")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "brown")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "browni")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "browser")), BooleanClause.Occur.SHOULD)
+            .build();
+        Analyzer analyzer = CustomAnalyzer.builder()
+            .withTokenizer(EdgeNGramTokenizerFactory.class, "minGramSize", "1", "maxGramSize", "7")
+            .build();
+        assertHighlightOneDoc("text", inputs,
+            analyzer, query, Locale.ROOT, BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs);
+    }
+
 }
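
The new test relies on the edge n-gram tokenizer emitting several prefixes of the same word, all starting at the same offset, which is exactly the overlapping-offsets case the formatter now merges. The snippet below is an illustrative way to inspect those offsets; it reuses the analyzer configuration from the test, and the offsets quoted in the comment are what an edge n-gram tokenizer is expected to produce for "brown".

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

// Prints each edge n-gram of "brown" with its offsets to show why they overlap.
public class EdgeNGramOffsets {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = CustomAnalyzer.builder()
            .withTokenizer(EdgeNGramTokenizerFactory.class, "minGramSize", "1", "maxGramSize", "7")
            .build();
        try (TokenStream ts = analyzer.tokenStream("text", "brown")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // expected: b [0,1)  br [0,2)  bro [0,3)  brow [0,4)  brown [0,5)
                System.out.println(term.toString() + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
            }
            ts.end();
        }
    }
}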
