elastic · jimczi · Oct 2, 2019 · Sep 27, 2019 · Sep 27, 2019 · Sep 30, 2019
diff --git a/...apper-extras/src/test/resources/rest-api-spec/test/search-as-you-type/20_highlighting.yml b/...apper-extras/src/test/resources/rest-api-spec/test/search-as-you-type/20_highlighting.yml
@@ -165,7 +165,7 @@ setup:
   - match: { hits.hits.0._source.a_field: "quick brown fox jump lazy dog" }
   - match: { hits.hits.0._source.text_field: "quick brown fox jump lazy dog" }
   - match: { hits.hits.0.highlight.a_field: ["quick <em>brown</em> <em>fox</em> <em>jump</em> lazy dog"] }
-  - match: { hits.hits.0.highlight.a_field\._2gram: ["quick <em>brown fox</em><em> jump</em> lazy dog"] }
+  - match: { hits.hits.0.highlight.a_field\._2gram: ["quick <em>brown fox jump</em> lazy dog"] }
   - match: { hits.hits.0.highlight.a_field\._3gram: ["quick <em>brown fox jump</em> lazy dog"] }
   - match: { hits.hits.0.highlight.a_field\._4gram: null }
 
@@ -197,6 +197,6 @@ setup:
   - match: { hits.hits.0._source.a_field: "quick brown fox jump lazy dog" }
   - match: { hits.hits.0._source.text_field: "quick brown fox jump lazy dog" }
   - match: { hits.hits.0.highlight.a_field: ["quick <em>brown</em> <em>fox</em> <em>jump</em> <em>lazy</em> dog"] }
-  - match: { hits.hits.0.highlight.a_field\._2gram: ["quick <em>brown fox</em><em> jump</em><em> lazy</em> dog"] }
-  - match: { hits.hits.0.highlight.a_field\._3gram: ["quick <em>brown fox jump</em><em> lazy</em> dog"] }
+  - match: { hits.hits.0.highlight.a_field\._2gram: ["quick <em>brown fox jump lazy</em> dog"] }
+  - match: { hits.hits.0.highlight.a_field\._3gram: ["quick <em>brown fox jump lazy</em> dog"] }
   - match: { hits.hits.0.highlight.a_field\._4gram: ["quick <em>brown fox jump lazy</em> dog"] }
diff --git a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomPassageFormatter.java b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomPassageFormatter.java
@@ -49,17 +49,23 @@ public Snippet[] format(Passage[] passages, String content) {
             pos = passage.getStartOffset();
             for (int i = 0; i < passage.getNumMatches(); i++) {
                 int start = passage.getMatchStarts()[i];
+                assert start >= pos && start < passage.getEndOffset();
+                // append content before this start
+                append(sb, content, pos, start);
+
                 int end = passage.getMatchEnds()[i];
-                // its possible to have overlapping terms
-                if (start > pos) {
-                    append(sb, content, pos, start);
-                }
-                if (end > pos) {
-                    sb.append(preTag);
-                    append(sb, content, Math.max(pos, start), end);
-                    sb.append(postTag);
-                    pos = end;
+                assert end > start;
+                // Look ahead to expand 'end' past all overlapping:
+                while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i + 1] < end) {
+                    end = passage.getMatchEnds()[++i];
                 }
+                end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage
+
+                sb.append(preTag);
+                append(sb, content, start, end);
+                sb.append(postTag);
+
+                pos = end;
             }
             // its possible a "term" from the analyzer could span a sentence boundary.
             append(sb, content, pos, Math.max(pos, passage.getEndOffset()));

diff --git a/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java
@@ -20,6 +20,8 @@
 package org.apache.lucene.search.uhighlight;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -32,6 +34,7 @@
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.PhraseQuery;
@@ -224,4 +227,33 @@ public void testGroupSentences() throws Exception {
             BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20), 0, outputs);
     }
 
+    public void testOverlappingTerms() throws Exception {
+        final String[] inputs = {
+            "bro",
+            "brown",
+            "brownie",
+            "browser"
+        };
+        final String[] outputs = {
+            "<b>bro</b>",
+            "<b>brown</b>",
+            "<b>browni</b>e",
+            "<b>browser</b>"
+        };
+        BooleanQuery query = new BooleanQuery.Builder()
+            .add(new FuzzyQuery(new Term("text", "brow")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "b")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "br")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "bro")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "brown")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "browni")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "browser")), BooleanClause.Occur.SHOULD)
+            .build();
+        Analyzer analyzer = CustomAnalyzer.builder()
+            .withTokenizer(EdgeNGramTokenizerFactory.class, "minGramSize", "1", "maxGramSize", "7")
+            .build();
+        assertHighlightOneDoc("text", inputs,
+            analyzer, query, Locale.ROOT, BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs);
+    }
+
 }