From c340814b344523647aa700a49158ac0dc947e06f Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Wed, 2 Oct 2019 15:23:39 +0200 Subject: [PATCH] Fix highlighting of overlapping terms in the unified highlighter (#47227) The passage formatter that the unified highlighter uses doesn't handle terms with overlapping offsets. For tokenizers that provide multiple segmentations of the same term (edge ngram for instance) the formatter should select the largest span in order to highlight the term only once. This change implements this logic. --- .../search-as-you-type/20_highlighting.yml | 6 ++-- .../uhighlight/CustomPassageFormatter.java | 24 ++++++++------ .../CustomUnifiedHighlighterTests.java | 32 +++++++++++++++++++ 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/modules/mapper-extras/src/test/resources/rest-api-spec/test/search-as-you-type/20_highlighting.yml b/modules/mapper-extras/src/test/resources/rest-api-spec/test/search-as-you-type/20_highlighting.yml index b09bc8418c98a..15778393959e5 100644 --- a/modules/mapper-extras/src/test/resources/rest-api-spec/test/search-as-you-type/20_highlighting.yml +++ b/modules/mapper-extras/src/test/resources/rest-api-spec/test/search-as-you-type/20_highlighting.yml @@ -165,7 +165,7 @@ setup: - match: { hits.hits.0._source.a_field: "quick brown fox jump lazy dog" } - match: { hits.hits.0._source.text_field: "quick brown fox jump lazy dog" } - match: { hits.hits.0.highlight.a_field: ["quick brown fox jump lazy dog"] } - - match: { hits.hits.0.highlight.a_field\._2gram: ["quick brown fox jump lazy dog"] } + - match: { hits.hits.0.highlight.a_field\._2gram: ["quick brown fox jump lazy dog"] } - match: { hits.hits.0.highlight.a_field\._3gram: ["quick brown fox jump lazy dog"] } - match: { hits.hits.0.highlight.a_field\._4gram: null } @@ -197,6 +197,6 @@ setup: - match: { hits.hits.0._source.a_field: "quick brown fox jump lazy dog" } - match: { hits.hits.0._source.text_field: "quick brown fox jump lazy dog" } - match: { 
hits.hits.0.highlight.a_field: ["quick brown fox jump lazy dog"] } - - match: { hits.hits.0.highlight.a_field\._2gram: ["quick brown fox jump lazy dog"] } - - match: { hits.hits.0.highlight.a_field\._3gram: ["quick brown fox jump lazy dog"] } + - match: { hits.hits.0.highlight.a_field\._2gram: ["quick brown fox jump lazy dog"] } + - match: { hits.hits.0.highlight.a_field\._3gram: ["quick brown fox jump lazy dog"] } - match: { hits.hits.0.highlight.a_field\._4gram: ["quick brown fox jump lazy dog"] } diff --git a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomPassageFormatter.java b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomPassageFormatter.java index 52eee559c6888..723c30f10dc61 100644 --- a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomPassageFormatter.java +++ b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomPassageFormatter.java @@ -49,17 +49,23 @@ public Snippet[] format(Passage[] passages, String content) { pos = passage.getStartOffset(); for (int i = 0; i < passage.getNumMatches(); i++) { int start = passage.getMatchStarts()[i]; + assert start >= pos && start < passage.getEndOffset(); + // append content before this start + append(sb, content, pos, start); + int end = passage.getMatchEnds()[i]; - // its possible to have overlapping terms - if (start > pos) { - append(sb, content, pos, start); - } - if (end > pos) { - sb.append(preTag); - append(sb, content, Math.max(pos, start), end); - sb.append(postTag); - pos = end; + assert end > start; + // Look ahead to expand 'end' past all overlapping: + while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i + 1] < end) { + end = passage.getMatchEnds()[++i]; } + end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage + + sb.append(preTag); + append(sb, content, start, end); + sb.append(postTag); + + pos = end; } // its possible a "term" from the analyzer could span a sentence boundary. 
append(sb, content, pos, Math.max(pos, passage.getEndOffset())); diff --git a/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java index 4e4b04d1ff19c..6abb67c3ea113 100644 --- a/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java +++ b/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java @@ -20,6 +20,8 @@ package org.apache.lucene.search.uhighlight; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.custom.CustomAnalyzer; +import org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -33,6 +35,7 @@ import org.apache.lucene.queries.CommonTermsQuery; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.PhraseQuery; @@ -240,4 +243,33 @@ public void testGroupSentences() throws Exception { BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20), 0, outputs); } + public void testOverlappingTerms() throws Exception { + final String[] inputs = { + "bro", + "brown", + "brownie", + "browser" + }; + final String[] outputs = { + "bro", + "brown", + "brownie", + "browser" + }; + BooleanQuery query = new BooleanQuery.Builder() + .add(new FuzzyQuery(new Term("text", "brow")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "b")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "br")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "bro")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "brown")), 
BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "browni")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "browser")), BooleanClause.Occur.SHOULD) + .build(); + Analyzer analyzer = CustomAnalyzer.builder() + .withTokenizer(EdgeNGramTokenizerFactory.class, "minGramSize", "1", "maxGramSize", "7") + .build(); + assertHighlightOneDoc("text", inputs, + analyzer, query, Locale.ROOT, BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs); + } + }