From dd1b2a824c94b93c401782705f3f1e746b5031fd Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Fri, 5 Dec 2025 10:28:19 +0100 Subject: [PATCH 1/3] Normalize the title and description we return over fetch(), fix highlighting --- .../SearchResults/SearchResultsListItem.tsx | 16 +- .../SearchOrAskAi/Search/useSearchQuery.ts | 2 - .../Search/SearchUsecase.cs | 4 - .../Adapters/Search/ElasticsearchGateway.cs | 53 +-- .../Search/StringHighlightExtensions.cs | 137 ++++++ .../SearchRelevanceTests.cs | 2 + .../Search/StringHighlightExtensionsTests.cs | 434 ++++++++++++++++++ 7 files changed, 601 insertions(+), 47 deletions(-) create mode 100644 src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/StringHighlightExtensions.cs create mode 100644 tests/Elastic.Documentation.Api.Infrastructure.Tests/Adapters/Search/StringHighlightExtensionsTests.cs diff --git a/src/Elastic.Documentation.Site/Assets/web-components/SearchOrAskAi/Search/SearchResults/SearchResultsListItem.tsx b/src/Elastic.Documentation.Site/Assets/web-components/SearchOrAskAi/Search/SearchResults/SearchResultsListItem.tsx index 1e9df8a8b..af425ff18 100644 --- a/src/Elastic.Documentation.Site/Assets/web-components/SearchOrAskAi/Search/SearchResults/SearchResultsListItem.tsx +++ b/src/Elastic.Documentation.Site/Assets/web-components/SearchOrAskAi/Search/SearchResults/SearchResultsListItem.tsx @@ -143,9 +143,7 @@ export function SearchResultListItem({ `} > @@ -164,14 +162,10 @@ export function SearchResultListItem({ //width: 90%; `} > - {result.highlightedBody ? 
( - - ) : ( - {result.description} - )} + {result.parents.length > 0 && ( diff --git a/src/Elastic.Documentation.Site/Assets/web-components/SearchOrAskAi/Search/useSearchQuery.ts b/src/Elastic.Documentation.Site/Assets/web-components/SearchOrAskAi/Search/useSearchQuery.ts index 424f645c0..b1beae705 100644 --- a/src/Elastic.Documentation.Site/Assets/web-components/SearchOrAskAi/Search/useSearchQuery.ts +++ b/src/Elastic.Documentation.Site/Assets/web-components/SearchOrAskAi/Search/useSearchQuery.ts @@ -35,8 +35,6 @@ const SearchResultItem = z.object({ description: z.string(), score: z.number(), parents: z.array(SearchResultItemParent), - highlightedTitle: z.string().nullish(), - highlightedBody: z.string().nullish(), }) export type SearchResultItem = z.infer diff --git a/src/api/Elastic.Documentation.Api.Core/Search/SearchUsecase.cs b/src/api/Elastic.Documentation.Api.Core/Search/SearchUsecase.cs index 38f492dca..5b4834a91 100644 --- a/src/api/Elastic.Documentation.Api.Core/Search/SearchUsecase.cs +++ b/src/api/Elastic.Documentation.Api.Core/Search/SearchUsecase.cs @@ -82,9 +82,5 @@ public record SearchResultItem public required string Title { get; init; } public required string Description { get; init; } public required SearchResultItemParent[] Parents { get; init; } - public string[]? Headings { get; init; } public float Score { get; init; } - public string? HighlightedBody { get; init; } - - public string? 
HighlightedTitle { get; init; } } diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs index be7c6befc..c693b82e4 100644 --- a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs +++ b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs @@ -276,18 +276,13 @@ public async Task SearchImplementation(string query, int pageNumbe ) ) .Highlight(h => h - .RequireFieldMatch(true) .Fields(f => f - .Add(Infer.Field(d => d.SearchTitle.Suffix("completion")), hf => hf + .Add(Infer.Field(d => d.Title), hf => hf .FragmentSize(150) .NumberOfFragments(3) .NoMatchSize(150) - .BoundaryChars(":.!?\t\n") - .BoundaryScanner(BoundaryScanner.Sentence) - .BoundaryMaxScan(15) - .FragmentOffset(0) .HighlightQuery(q => q.Match(m => m - .Field(d => d.SearchTitle.Suffix("completion")) + .Field(d => d.Title) .Query(searchQuery) .Analyzer("highlight_analyzer") )) @@ -297,15 +292,6 @@ public async Task SearchImplementation(string query, int pageNumbe .FragmentSize(150) .NumberOfFragments(3) .NoMatchSize(150) - .BoundaryChars(":.!?\t\n") - .BoundaryScanner(BoundaryScanner.Sentence) - .BoundaryMaxScan(15) - .FragmentOffset(0) - .HighlightQuery(q => q.Match(m => m - .Field(d => d.StrippedBody) - .Query(searchQuery) - .Analyzer("highlight_analyzer") - )) .PreTags(preTag) .PostTags(postTag)) ) @@ -324,7 +310,7 @@ public async Task SearchImplementation(string query, int pageNumbe else _logger.LogInformation("RRF search completed for '{Query}'. 
Total hits: {TotalHits}", query, response.Total); - return ProcessSearchResponse(response); + return ProcessSearchResponse(response, searchQuery); } catch (Exception ex) { @@ -333,9 +319,10 @@ public async Task SearchImplementation(string query, int pageNumbe } } - private static SearchResult ProcessSearchResponse(SearchResponse response) + private static SearchResult ProcessSearchResponse(SearchResponse response, string searchQuery) { var totalHits = (int)response.Total; + var searchTokens = searchQuery.Split(' ', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries); var results = response.Documents.Select((doc, index) => { @@ -348,36 +335,42 @@ private static SearchResult ProcessSearchResponse(SearchResponse 0) - highlightedBody = string.Join(". ", bodyHighlights.Select(h => h.TrimEnd('.', ' ', '-'))); + highlightedBody = string.Join(". ", bodyHighlights.Select(h => h.Trim(['|', ' ', '.', '-']))); - if (highlights.TryGetValue("search_title.completion", out var titleHighlights) && titleHighlights.Count > 0) - highlightedTitle = string.Join(". ", titleHighlights.Select(h => h.TrimEnd('.', ' ', '-'))); + if (highlights.TryGetValue("title", out var titleHighlights) && titleHighlights.Count > 0) + highlightedTitle = string.Join(". ", titleHighlights.Select(h => h.Trim(['|', ' ', '.', '-']))); } + var title = (highlightedTitle ?? doc.Title).HighlightTokens(searchTokens); + var description = (!string.IsNullOrWhiteSpace(highlightedBody) ? highlightedBody : doc.Description ?? string.Empty) + .Replace("\r\n", " ") + .Replace("\n", " ") + .Replace("\r", " ") + .Trim(['|', ' ']) + .HighlightTokens(searchTokens); + return new SearchResultItem { Url = doc.Url, - Title = doc.Title, + Title = title, Type = doc.Type, - Description = doc.Description ?? 
string.Empty, - Headings = doc.Headings, + Description = description, Parents = doc.Parents.Select(parent => new SearchResultItemParent { Title = parent.Title, Url = parent.Url }).ToArray(), - Score = (float)(hit?.Score ?? 0.0), - HighlightedTitle = highlightedTitle, - HighlightedBody = highlightedBody + Score = (float)(hit?.Score ?? 0.0) }; }).ToList(); // Extract aggregations var aggregations = new Dictionary(); - if (response.Aggregations?.TryGetValue("type", out var typeAgg) == true && typeAgg is StringTermsAggregate stringTermsAgg) + var terms = response.Aggregations?.GetStringTerms("type"); + if (terms is not null) { - foreach (var bucket in stringTermsAgg.Buckets) - aggregations[bucket.Key.ToString()!] = bucket.DocCount; + foreach (var bucket in terms.Buckets) + aggregations[bucket.Key.ToString()] = bucket.DocCount; } return new SearchResult diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/StringHighlightExtensions.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/StringHighlightExtensions.cs new file mode 100644 index 000000000..ff557a5e9 --- /dev/null +++ b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/StringHighlightExtensions.cs @@ -0,0 +1,137 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + +using System.Text; + +namespace Elastic.Documentation.Api.Infrastructure.Adapters.Search; + +public static class StringHighlightExtensions +{ + private const string MarkOpen = ""; + private const string MarkClose = ""; + + /// + /// Highlights search tokens in text by wrapping them with <mark> tags. + /// Skips tokens that are already highlighted or are inside existing mark tags. 
+ /// + /// The text to highlight tokens in + /// The search tokens to highlight + /// Text with highlighted tokens + public static string HighlightTokens(this string text, ReadOnlySpan tokens) + { + if (tokens.Length == 0 || string.IsNullOrEmpty(text)) + return text; + + var result = text; + + foreach (var token in tokens) + { + if (string.IsNullOrEmpty(token)) + continue; + + result = HighlightSingleToken(result, token); + } + + return result; + } + + private static string HighlightSingleToken(string text, string token) + { + // Check if this exact token is already fully highlighted somewhere + // This prevents double-highlighting + if (text.Contains($"{MarkOpen}{token}{MarkClose}", StringComparison.OrdinalIgnoreCase)) + return text; + + var sb = new StringBuilder(text.Length + 26); // Room for a couple of mark tags + var textSpan = text.AsSpan(); + var tokenSpan = token.AsSpan(); + var pos = 0; + + while (pos < textSpan.Length) + { + var remaining = textSpan[pos..]; + var matchIndex = remaining.IndexOf(tokenSpan, StringComparison.OrdinalIgnoreCase); + + if (matchIndex < 0) + { + // No more matches, append rest and exit + _ = sb.Append(remaining); + break; + } + + var absoluteIndex = pos + matchIndex; + + // Check if we're inside mark tag syntax or inside mark tag content + if (IsInsideMarkTagSyntax(textSpan, absoluteIndex, tokenSpan.Length) || IsInsideMarkTagContent(textSpan, absoluteIndex)) + { + // Append up to and including this match without highlighting + _ = sb.Append(remaining[..(matchIndex + tokenSpan.Length)]); + pos = absoluteIndex + token.Length; + continue; + } + + // Append text before match, then highlighted token (preserving original case) + _ = sb.Append(remaining[..matchIndex]) + .Append(MarkOpen) + .Append(remaining.Slice(matchIndex, tokenSpan.Length)) + .Append(MarkClose); + + pos = absoluteIndex + token.Length; + } + + return sb.ToString(); + } + + private static bool IsInsideMarkTagSyntax(ReadOnlySpan text, int position, int tokenLength) + { 
+ // Check if the match position overlaps with or tag syntax + // We want to protect the literal tag strings, not arbitrary HTML + + var matchEnd = position + tokenLength; + + // Look for that contains our position + var searchStart = Math.Max(0, position - 5); // is 6 chars, so look back 5 + var searchEnd = Math.Min(text.Length, matchEnd + 6); + var searchRegion = text[searchStart..searchEnd]; + + var markOpenIdx = searchRegion.IndexOf(MarkOpen.AsSpan(), StringComparison.OrdinalIgnoreCase); + if (markOpenIdx >= 0) + { + var absoluteMarkStart = searchStart + markOpenIdx; + var absoluteMarkEnd = absoluteMarkStart + MarkOpen.Length; + // Check if our match overlaps with this tag + if (position < absoluteMarkEnd && matchEnd > absoluteMarkStart) + return true; + } + + // Look for that contains our position + searchStart = Math.Max(0, position - 6); // is 7 chars + searchEnd = Math.Min(text.Length, matchEnd + 7); + searchRegion = text[searchStart..searchEnd]; + + var markCloseIdx = searchRegion.IndexOf(MarkClose.AsSpan(), StringComparison.OrdinalIgnoreCase); + if (markCloseIdx >= 0) + { + var absoluteMarkStart = searchStart + markCloseIdx; + var absoluteMarkEnd = absoluteMarkStart + MarkClose.Length; + // Check if our match overlaps with this tag + if (position < absoluteMarkEnd && matchEnd > absoluteMarkStart) + return true; + } + + return false; + } + + private static bool IsInsideMarkTagContent(ReadOnlySpan text, int position) + { + // Look backwards from position to find the last or + var beforePosition = text[..position]; + + var lastOpen = beforePosition.LastIndexOf(MarkOpen.AsSpan(), StringComparison.OrdinalIgnoreCase); + var lastClose = beforePosition.LastIndexOf(MarkClose.AsSpan(), StringComparison.OrdinalIgnoreCase); + + // If we found an opening tag after the last closing tag, we're inside a mark's content + return lastOpen > lastClose; + } +} diff --git a/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs 
b/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs index b2a470b7a..a6a98d800 100644 --- a/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs +++ b/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs @@ -64,6 +64,8 @@ public class SearchRelevanceTests(ITestOutputHelper output) { "universal profiling", "/docs/solutions/observability/infra-and-hosts/universal-profiling", null}, { "agg", "/docs/explore-analyze/query-filter/aggregations", null}, { "a", "/docs/reference/apm/observability/apm", null}, + { "index.number_of_replicas", "/docs/reference/elasticsearch/index-settings/index-modules", null}, + //{ "index.use_time_series_doc_values_format", "/docs/reference/elasticsearch/index-settings/index-modules", null}, //universal profiling }; diff --git a/tests/Elastic.Documentation.Api.Infrastructure.Tests/Adapters/Search/StringHighlightExtensionsTests.cs b/tests/Elastic.Documentation.Api.Infrastructure.Tests/Adapters/Search/StringHighlightExtensionsTests.cs new file mode 100644 index 000000000..760d5323d --- /dev/null +++ b/tests/Elastic.Documentation.Api.Infrastructure.Tests/Adapters/Search/StringHighlightExtensionsTests.cs @@ -0,0 +1,434 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. 
+// See the LICENSE file in the project root for more information + +using Elastic.Documentation.Api.Infrastructure.Adapters.Search; +using FluentAssertions; +using Xunit; + +namespace Elastic.Documentation.Api.Infrastructure.Tests.Adapters.Search; + +public class StringHighlightExtensionsTests +{ + [Fact] + public void EmptyTokensReturnsOriginalText() + { + var text = "Hello world"; + var result = text.HighlightTokens([]); + + result.Should().Be(text); + } + + [Fact] + public void EmptyTextReturnsEmptyString() + { + var result = "".HighlightTokens(["test"]); + + result.Should().BeEmpty(); + } + + [Fact] + public void NullTextReturnsNull() + { + string? text = null; + var result = text!.HighlightTokens(["test"]); + + result.Should().BeNull(); + } + + [Fact] + public void SingleTokenHighlightsMatch() + { + var text = "Hello world"; + var result = text.HighlightTokens(["world"]); + + result.Should().Be("Hello world"); + } + + [Fact] + public void SingleTokenHighlightsFirstCharacter() + { + var text = "Aggregations are useful"; + var result = text.HighlightTokens(["Ag"]); + + result.Should().Be("Aggregations are useful"); + } + + [Fact] + public void SingleTokenCaseInsensitiveMatch() + { + var text = "Hello WORLD"; + var result = text.HighlightTokens(["world"]); + + result.Should().Be("Hello WORLD"); + } + + [Fact] + public void SingleTokenPreservesOriginalCase() + { + var text = "Hello WoRlD"; + var result = text.HighlightTokens(["world"]); + + result.Should().Be("Hello WoRlD"); + } + + [Fact] + public void SingleTokenMultipleOccurrences() + { + var text = "test one test two test"; + var result = text.HighlightTokens(["test"]); + + result.Should().Be("test one test two test"); + } + + [Fact] + public void MultipleTokensHighlightsAll() + { + var text = "Hello world from here"; + var result = text.HighlightTokens(["hello", "world"]); + + result.Should().Be("Hello world from here"); + } + + [Fact] + public void AlreadyHighlightedTokenSkipsDoubleHighlighting() + { + var 
text = "Hello world again"; + var result = text.HighlightTokens(["world"]); + + result.Should().Be("Hello world again"); + } + + [Fact] + public void TokenInsideMarkTagNotHighlighted() + { + var text = "hello world and world outside"; + var result = text.HighlightTokens(["world"]); + + result.Should().Be("hello world and world outside"); + } + + [Fact] + public void SingleCharTokensHighlighted() + { + var text = "a b c test"; + var result = text.HighlightTokens(["a", "b", "test"]); + + result.Should().Be("a b c test"); + } + + [Fact] + public void TokenNotFoundReturnsOriginal() + { + var text = "Hello world"; + var result = text.HighlightTokens(["notfound"]); + + result.Should().Be(text); + } + + [Fact] + public void MixedHighlightedAndUnhighlighted() + { + var text = "elasticsearch documentation"; + var result = text.HighlightTokens(["elastic", "documentation"]); + + result.Should().Be("elasticsearch documentation"); + } + + [Fact] + public void TokenAtStartOfText() + { + var text = "Elasticsearch is great"; + var result = text.HighlightTokens(["Elasticsearch"]); + + result.Should().Be("Elasticsearch is great"); + } + + [Fact] + public void TokenAtEndOfText() + { + var text = "Search with Elasticsearch"; + var result = text.HighlightTokens(["Elasticsearch"]); + + result.Should().Be("Search with Elasticsearch"); + } + + [Fact] + public void PartialTokenInsideExistingMarkNotDoubleHighlighted() + { + var text = "dotnet is a framework"; + var result = text.HighlightTokens(["net"]); + + // "net" inside dotnet should not be highlighted + result.Should().Be("dotnet is a framework"); + } + + [Fact] + public void ConsecutiveTokensWithoutSpace() + { + var text = "HelloWorld"; + var result = text.HighlightTokens(["Hello", "World"]); + + result.Should().Be("HelloWorld"); + } + + [Fact] + public void OverlappingTokensFirstWins() + { + var text = "testing"; + var result = text.HighlightTokens(["test", "sting"]); + + // "test" gets highlighted first, then "sting" check finds 
"ting" but "s" is outside the mark + result.Should().Contain("test"); + } + + [Fact] + public void SpecialCharactersInTextHandledCorrectly() + { + var text = "C# and .NET framework"; + var result = text.HighlightTokens(["NET"]); + + result.Should().Be("C# and .NET framework"); + } + + [Fact] + public void MultipleMarksWithTokenBetween() + { + var text = "first middle last"; + var result = text.HighlightTokens(["middle"]); + + result.Should().Be("first middle last"); + } + + [Fact] + public void EmptyTokenInArrayIgnored() + { + var text = "Hello world"; + var result = text.HighlightTokens(["", "world", null!]); + + result.Should().Be("Hello world"); + } + + [Fact] + public void TokenMatchingMarkTagNotBroken() + { + // Edge case: what if someone searches for "mark"? + var text = "The mark element is used for highlighting"; + var result = text.HighlightTokens(["mark"]); + + result.Should().Be("The mark element is used for highlighting"); + } + + [Fact] + public void NestedMarkTagsHandledCorrectly() + { + // This shouldn't happen in practice but let's make sure we don't break + var text = "outer inner outer"; + var result = text.HighlightTokens(["test"]); + + result.Should().Be(text); + } + + [Fact] + public void LongTextWithManyMatchesPerformsWell() + { + var text = string.Join(" ", Enumerable.Repeat("elasticsearch kibana logstash beats", 100)); + var result = text.HighlightTokens(["elasticsearch", "kibana"]); + + result.Should().Contain("elasticsearch"); + result.Should().Contain("kibana"); + result.Should().NotContain("logstash"); + } + + [Fact] + public void UnicodeTextHandledCorrectly() + { + var text = "日本語 elasticsearch テスト"; + var result = text.HighlightTokens(["elasticsearch"]); + + result.Should().Be("日本語 elasticsearch テスト"); + } + + [Fact] + public void TokenWithUppercaseMarkStillWorks() + { + var text = "Hello world test"; + var result = text.HighlightTokens(["world"]); + + // The existing mark is uppercase, should still be detected + 
result.Should().Be("Hello world test"); + } + + [Fact] + public void RealWorldExampleSearchResults() + { + var text = "Elasticsearch is a distributed, RESTful search and analytics engine"; + var result = text.HighlightTokens(["elasticsearch", "search"]); + + result.Should().Be("Elasticsearch is a distributed, RESTful search and analytics engine"); + } + + [Fact] + public void RealWorldExamplePartiallyHighlighted() + { + var text = "Learn about Elasticsearch and how to use search effectively"; + var result = text.HighlightTokens(["elasticsearch", "search"]); + + result.Should().Be("Learn about Elasticsearch and how to use search effectively"); + } + + [Fact] + public void StartOfStringHighlight() + { + var text = "APM Architecture for AWS Lambda"; + var result = text.HighlightTokens(["apm", "ar"]); + + result.Should().Be("APM Architecture for AWS Lambda"); + } + + [Fact] + public void StartOfStringHighlight2() + { + var text = "APM Architecture for AWS Lambda"; + var result = text.HighlightTokens(["a"]); + + result.Should().Be("APM Architecture for AWS Lambda"); + } + + [Fact] + public void IgnoreOtherHtml() + { + var text = "<>APM<> Architecture for AWS Lambda"; + var result = text.HighlightTokens(["apm"]); + + result.Should().Be("<>APM<> Architecture for AWS Lambda"); + } + + [Fact] + public void HighlightInsideNonMarkHtml() + { + // Only tags are protected, other HTML tags get their content highlighted + var text = " Architecture for AWS Lambda"; + var result = text.HighlightTokens(["apm"]); + + result.Should().Be("<APM> Architecture for AWS Lambda"); + } + + [Fact] + public void PartiallyHighlightedTitleHighlightsRemaining() + { + var text = "Elasticsearch cluster management"; + var result = text.HighlightTokens(["search", "cluster"]); + + result.Should().Be("Elasticsearch cluster management"); + } + + [Fact] + public void PartiallyHighlightedMiddleHighlightsAround() + { + var text = "Learn Elasticsearch basics today"; + var result = 
text.HighlightTokens(["learn", "basics", "today"]); + + result.Should().Be("Learn Elasticsearch basics today"); + } + + [Fact] + public void MultiplePartialHighlightsHighlightsGaps() + { + var text = "APM and logging for observability"; + var result = text.HighlightTokens(["apm", "and", "logging", "observability"]); + + result.Should().Be("APM and logging for observability"); + } + + [Fact] + public void BrokenMarkTagArkFragmentHandledSafely() + { + // Malformed HTML with "ark>" fragment + var text = "This has ark> in it and some test content"; + var result = text.HighlightTokens(["test"]); + + result.Should().Be("This has ark> in it and some test content"); + } + + [Fact] + public void BrokenMarkTagMaFragmentHandledSafely() + { + // Malformed HTML with "test content"); + } + + [Fact] + public void BrokenMarkTagMarkWithoutCloseHandledSafely() + { + // Unclosed tag + var text = "This has unclosed and test content"; + var result = text.HighlightTokens(["test"]); + + // Content after unclosed is considered inside the tag + result.Should().Be("This has unclosed and test content"); + } + + [Fact] + public void BrokenMarkTagCloseWithoutOpenHandledSafely() + { + // without opening tag + var text = "This has orphan and test content"; + var result = text.HighlightTokens(["test", "orphan"]); + + result.Should().Be("This has orphan and test content"); + } + + [Fact] + public void BrokenMarkTagPartialOpenTagHandledSafely() + { + // Partial "test content"); + } + + [Fact] + public void BrokenMarkTagJustAngleBracketsHandledSafely() + { + var text = "Use < and > for comparisons and test values"; + var result = text.HighlightTokens(["test"]); + + result.Should().Be("Use < and > for comparisons and test values"); + } + + [Fact] + public void PartialHighlightTokenSpansHighlightBoundary() + { + // Token "search" spans from outside to inside highlighted area + var text = "fulltextsearch capabilities"; + var result = text.HighlightTokens(["search", "full"]); + + 
result.Should().Be("fulltextsearch capabilities"); + } + + [Fact] + public void PartialHighlightAdjacentMarks() + { + var text = "helloworld test"; + var result = text.HighlightTokens(["test"]); + + result.Should().Be("helloworld test"); + } + + [Fact] + public void PartialHighlightNestedLookingContent() + { + // Content that looks like it could be nested (but isn't valid HTML) + var text = "outer text"; + var result = text.HighlightTokens(["text"]); + + // "text" is after so should be highlighted + result.Should().Be("outer text"); + } +} From 6c035717bfb52b95a384f6ce4733dcb72cc791a7 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Fri, 5 Dec 2025 10:43:03 +0100 Subject: [PATCH 2/3] highlight synonyms --- .../Adapters/Search/ElasticsearchGateway.cs | 11 +- .../Search/StringHighlightExtensions.cs | 17 +- .../Search/StringHighlightExtensionsTests.cs | 252 ++++++++++++++++++ 3 files changed, 275 insertions(+), 5 deletions(-) diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs index c693b82e4..237925afc 100644 --- a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs +++ b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs @@ -310,7 +310,7 @@ public async Task SearchImplementation(string query, int pageNumbe else _logger.LogInformation("RRF search completed for '{Query}'. 
Total hits: {TotalHits}", query, response.Total); - return ProcessSearchResponse(response, searchQuery); + return ProcessSearchResponse(response, searchQuery, _searchConfiguration.SynonymBiDirectional); } catch (Exception ex) { @@ -319,7 +319,10 @@ public async Task SearchImplementation(string query, int pageNumbe } } - private static SearchResult ProcessSearchResponse(SearchResponse response, string searchQuery) + private static SearchResult ProcessSearchResponse( + SearchResponse response, + string searchQuery, + IReadOnlyDictionary synonyms) { var totalHits = (int)response.Total; var searchTokens = searchQuery.Split(' ', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries); @@ -341,13 +344,13 @@ private static SearchResult ProcessSearchResponse(SearchResponse h.Trim(['|', ' ', '.', '-']))); } - var title = (highlightedTitle ?? doc.Title).HighlightTokens(searchTokens); + var title = (highlightedTitle ?? doc.Title).HighlightTokens(searchTokens, synonyms); var description = (!string.IsNullOrWhiteSpace(highlightedBody) ? highlightedBody : doc.Description ?? 
string.Empty) .Replace("\r\n", " ") .Replace("\n", " ") .Replace("\r", " ") .Trim(['|', ' ']) - .HighlightTokens(searchTokens); + .HighlightTokens(searchTokens, synonyms); return new SearchResultItem { diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/StringHighlightExtensions.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/StringHighlightExtensions.cs index ff557a5e9..ab4ba34a2 100644 --- a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/StringHighlightExtensions.cs +++ b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/StringHighlightExtensions.cs @@ -17,8 +17,12 @@ public static class StringHighlightExtensions /// /// The text to highlight tokens in /// The search tokens to highlight + /// Optional dictionary of synonyms to also highlight /// Text with highlighted tokens - public static string HighlightTokens(this string text, ReadOnlySpan tokens) + public static string HighlightTokens( + this string text, + ReadOnlySpan tokens, + IReadOnlyDictionary? 
synonyms = null) { if (tokens.Length == 0 || string.IsNullOrEmpty(text)) return text; @@ -30,7 +34,18 @@ public static string HighlightTokens(this string text, ReadOnlySpan toke if (string.IsNullOrEmpty(token)) continue; + // Highlight the token itself result = HighlightSingleToken(result, token); + + // Highlight synonyms for this token + if (synonyms == null || !synonyms.TryGetValue(token, out var tokenSynonyms)) + continue; + + foreach (var synonym in tokenSynonyms) + { + if (!string.IsNullOrEmpty(synonym)) + result = HighlightSingleToken(result, synonym); + } } return result; diff --git a/tests/Elastic.Documentation.Api.Infrastructure.Tests/Adapters/Search/StringHighlightExtensionsTests.cs b/tests/Elastic.Documentation.Api.Infrastructure.Tests/Adapters/Search/StringHighlightExtensionsTests.cs index 760d5323d..1b36932ee 100644 --- a/tests/Elastic.Documentation.Api.Infrastructure.Tests/Adapters/Search/StringHighlightExtensionsTests.cs +++ b/tests/Elastic.Documentation.Api.Infrastructure.Tests/Adapters/Search/StringHighlightExtensionsTests.cs @@ -431,4 +431,256 @@ public void PartialHighlightNestedLookingContent() // "text" is after so should be highlighted result.Should().Be("outer text"); } + + // ========== Synonyms Tests ========== + + [Fact] + public void SynonymsNullDictionaryHighlightsOnlyTokens() + { + var text = "Kubernetes cluster management"; + var result = text.HighlightTokens(["kubernetes"], null); + + result.Should().Be("Kubernetes cluster management"); + } + + [Fact] + public void SynonymsEmptyDictionaryHighlightsOnlyTokens() + { + var text = "Kubernetes cluster management"; + var synonyms = new Dictionary(); + var result = text.HighlightTokens(["kubernetes"], synonyms); + + result.Should().Be("Kubernetes cluster management"); + } + + [Fact] + public void SynonymsHighlightsBothTokenAndSynonym() + { + var text = "Kubernetes and k8s are the same thing"; + var synonyms = new Dictionary + { + ["kubernetes"] = ["k8s"] + }; + var result = 
text.HighlightTokens(["kubernetes"], synonyms); + + result.Should().Be("Kubernetes and k8s are the same thing"); + } + + [Fact] + public void SynonymsHighlightsMultipleSynonyms() + { + var text = "Use Elasticsearch or ES or elastic for search"; + var synonyms = new Dictionary + { + ["elasticsearch"] = ["es", "elastic"] + }; + var result = text.HighlightTokens(["elasticsearch"], synonyms); + + result.Should().Be("Use Elasticsearch or ES or elastic for search"); + } + + [Fact] + public void SynonymsCaseInsensitiveLookup() + { + var text = "K8S is short for kubernetes"; + var synonyms = new Dictionary(StringComparer.OrdinalIgnoreCase) + { + ["kubernetes"] = ["k8s"] + }; + var result = text.HighlightTokens(["KUBERNETES"], synonyms); + + result.Should().Be("K8S is short for kubernetes"); + } + + [Fact] + public void SynonymsTokenNotInDictionary() + { + var text = "Logstash is a pipeline tool"; + var synonyms = new Dictionary + { + ["elasticsearch"] = ["es"] + }; + var result = text.HighlightTokens(["logstash"], synonyms); + + result.Should().Be("Logstash is a pipeline tool"); + } + + [Fact] + public void SynonymsEmptySynonymArrayIgnored() + { + var text = "Elasticsearch is powerful"; + var synonyms = new Dictionary + { + ["elasticsearch"] = [] + }; + var result = text.HighlightTokens(["elasticsearch"], synonyms); + + result.Should().Be("Elasticsearch is powerful"); + } + + [Fact] + public void SynonymsEmptyStringsInArrayIgnored() + { + var text = "Kubernetes and k8s cluster"; + var synonyms = new Dictionary + { + ["kubernetes"] = ["", "k8s", null!, ""] + }; + var result = text.HighlightTokens(["kubernetes"], synonyms); + + result.Should().Be("Kubernetes and k8s cluster"); + } + + [Fact] + public void SynonymsMultipleTokensWithDifferentSynonyms() + { + var text = "Deploy k8s with es and ml for machine learning"; + var synonyms = new Dictionary + { + ["kubernetes"] = ["k8s"], + ["elasticsearch"] = ["es"], + ["machine learning"] = ["ml"] + }; + var result = 
text.HighlightTokens(["kubernetes", "elasticsearch"], synonyms); + + result.Should().Be("Deploy k8s with es and ml for machine learning"); + } + + [Fact] + public void SynonymsAlreadyHighlightedSynonymNotDoubleHighlighted() + { + var text = "Use k8s for Kubernetes deployments"; + var synonyms = new Dictionary + { + ["kubernetes"] = ["k8s"] + }; + var result = text.HighlightTokens(["kubernetes"], synonyms); + + result.Should().Be("Use k8s for Kubernetes deployments"); + } + + [Fact] + public void SynonymsBiDirectionalLookup() + { + // Simulating bi-directional synonyms (as used in SearchConfiguration.SynonymBiDirectional) + var text = "Search with k8s or kubernetes in your cluster"; + var synonyms = new Dictionary(StringComparer.OrdinalIgnoreCase) + { + ["kubernetes"] = ["k8s"], + ["k8s"] = ["kubernetes"] + }; + var result = text.HighlightTokens(["k8s"], synonyms); + + result.Should().Be("Search with k8s or kubernetes in your cluster"); + } + + [Fact] + public void SynonymsMultipleOccurrencesOfSynonym() + { + var text = "k8s here and k8s there but also kubernetes"; + var synonyms = new Dictionary + { + ["kubernetes"] = ["k8s"] + }; + var result = text.HighlightTokens(["kubernetes"], synonyms); + + result.Should().Be("k8s here and k8s there but also kubernetes"); + } + + [Fact] + public void SynonymsRealWorldElasticSearchExample() + { + var text = "Configure ES cluster settings in Elasticsearch for elastic cloud"; + var synonyms = new Dictionary(StringComparer.OrdinalIgnoreCase) + { + ["elasticsearch"] = ["es", "elastic"] + }; + var result = text.HighlightTokens(["elasticsearch"], synonyms); + + result.Should().Be("Configure ES cluster settings in Elasticsearch for elastic cloud"); + } + + [Fact] + public void SynonymsRealWorldMachineLearningExample() + { + var text = "ML models for machine learning in the ml node"; + var synonyms = new Dictionary(StringComparer.OrdinalIgnoreCase) + { + ["machine learning"] = ["ml"] + }; + var result = text.HighlightTokens(["machine 
learning"], synonyms); + + // Note: "machine learning" as a token matches the phrase, ml is a synonym + result.Should().Be("ML models for machine learning in the ml node"); + } + + [Fact] + public void SynonymsSynonymInsideMarkTagNotHighlighted() + { + var text = "kubernetes and k8s are popular"; + var synonyms = new Dictionary + { + ["kubernetes"] = ["k8s"] + }; + var result = text.HighlightTokens(["kubernetes"], synonyms); + + // Both kubernetes and k8s are inside mark tag, should not be double-highlighted + result.Should().Be("kubernetes and k8s are popular"); + } + + [Fact] + public void SynonymsMixedHighlightedAndUnhighlightedSynonyms() + { + var text = "k8s and kubernetes cluster"; + var synonyms = new Dictionary + { + ["kubernetes"] = ["k8s"] + }; + var result = text.HighlightTokens(["kubernetes"], synonyms); + + result.Should().Be("k8s and kubernetes cluster"); + } + + [Fact] + public void SynonymsPreservesOriginalCaseForSynonym() + { + var text = "Use K8S for your deployments"; + var synonyms = new Dictionary + { + ["kubernetes"] = ["k8s"] + }; + var result = text.HighlightTokens(["kubernetes"], synonyms); + + // Original case "K8S" should be preserved in the highlight + result.Should().Be("Use K8S for your deployments"); + } + + [Fact] + public void SynonymsWithSpecialCharacters() + { + var text = "Use ES|QL or esql for queries"; + var synonyms = new Dictionary(StringComparer.OrdinalIgnoreCase) + { + ["esql"] = ["ES|QL"] + }; + var result = text.HighlightTokens(["esql"], synonyms); + + result.Should().Be("Use ES|QL or esql for queries"); + } + + [Fact] + public void SynonymsPartialMatchNotHighlighted() + { + // Synonym "k8s" should not match "k8ss" or "ak8s" + var text = "k8ss is not k8s and ak8s is wrong"; + var synonyms = new Dictionary + { + ["kubernetes"] = ["k8s"] + }; + var result = text.HighlightTokens(["kubernetes"], synonyms); + + // k8s within k8ss and ak8s will be highlighted since it's a substring match + // This is expected behavior - same as 
regular tokens + result.Should().Contain("k8s"); + } } From bfab5fa6cfba46c63ffefd878c4ab57dd8b68895 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Fri, 5 Dec 2025 10:52:52 +0100 Subject: [PATCH 3/3] Bidirectional synonyms now also parses source => target replacement syntax --- .../Search/SearchConfiguration.cs | 10 ++- .../Search/StringHighlightExtensions.cs | 64 +++++++++++++++++-- .../Search/StringHighlightExtensionsTests.cs | 15 +++++ 3 files changed, 83 insertions(+), 6 deletions(-) diff --git a/src/Elastic.Documentation.Configuration/Search/SearchConfiguration.cs b/src/Elastic.Documentation.Configuration/Search/SearchConfiguration.cs index 070a55704..ae52726af 100644 --- a/src/Elastic.Documentation.Configuration/Search/SearchConfiguration.cs +++ b/src/Elastic.Documentation.Configuration/Search/SearchConfiguration.cs @@ -25,10 +25,18 @@ public required IReadOnlyDictionary Synonyms var targets = new List(); foreach (var s in a) { - if (s.Contains(' ') || s.Contains("=>")) + if (s.Contains(' ')) continue; List newTarget = [s]; + if (s.Contains("=>")) + { + var tokens = s.Split("=>"); + if (tokens.Length > 1) + newTarget = [tokens[0].Trim()]; + else + continue; + } newTarget.AddRange(a.Except([s])); targets.Add(newTarget.ToArray()); } diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/StringHighlightExtensions.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/StringHighlightExtensions.cs index ab4ba34a2..b0f331a37 100644 --- a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/StringHighlightExtensions.cs +++ b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/StringHighlightExtensions.cs @@ -37,20 +37,74 @@ public static string HighlightTokens( // Highlight the token itself result = HighlightSingleToken(result, token); - // Highlight synonyms for this token - if (synonyms == null || !synonyms.TryGetValue(token, out var tokenSynonyms)) + if (synonyms == null) continue; - foreach 
(var synonym in tokenSynonyms) + // Highlight synonyms for this token (direct lookup) + if (synonyms.TryGetValue(token, out var tokenSynonyms)) { - if (!string.IsNullOrEmpty(synonym)) - result = HighlightSingleToken(result, synonym); + foreach (var synonym in tokenSynonyms) + { + var synonymToHighlight = ExtractSynonymTarget(synonym); + if (!string.IsNullOrEmpty(synonymToHighlight)) + result = HighlightSingleToken(result, synonymToHighlight); + } + } + + // Also check for hard replacements where this token is the source + // Format: "source => target" means when searching for "source", also highlight "target" + foreach (var kvp in synonyms) + { + foreach (var synonym in kvp.Value) + { + if (string.IsNullOrEmpty(synonym) || !synonym.Contains("=>")) + continue; + + var (source, target) = ParseHardReplacement(synonym); + if (!string.IsNullOrEmpty(source) && + !string.IsNullOrEmpty(target) && + source.Equals(token, StringComparison.OrdinalIgnoreCase)) + { + result = HighlightSingleToken(result, target); + } + } } } return result; } + /// + /// Extracts the target from a synonym entry, handling hard replacement format. + /// For "source => target" returns "target", otherwise returns the original synonym. + /// + private static string? ExtractSynonymTarget(string? synonym) + { + if (string.IsNullOrEmpty(synonym)) + return null; + + if (!synonym.Contains("=>")) + return synonym; + + var (_, target) = ParseHardReplacement(synonym); + return target; + } + + /// + /// Parses a hard replacement synonym format: "source => target" + /// + private static (string? Source, string? 
Target) ParseHardReplacement(string synonym) + { + var arrowIndex = synonym.IndexOf("=>", StringComparison.Ordinal); + if (arrowIndex < 0) + return (null, null); + + var source = synonym[..arrowIndex].Trim(); + var target = synonym[(arrowIndex + 2)..].Trim(); + + return (source, target); + } + private static string HighlightSingleToken(string text, string token) { // Check if this exact token is already fully highlighted somewhere diff --git a/tests/Elastic.Documentation.Api.Infrastructure.Tests/Adapters/Search/StringHighlightExtensionsTests.cs b/tests/Elastic.Documentation.Api.Infrastructure.Tests/Adapters/Search/StringHighlightExtensionsTests.cs index 1b36932ee..f33db93a3 100644 --- a/tests/Elastic.Documentation.Api.Infrastructure.Tests/Adapters/Search/StringHighlightExtensionsTests.cs +++ b/tests/Elastic.Documentation.Api.Infrastructure.Tests/Adapters/Search/StringHighlightExtensionsTests.cs @@ -683,4 +683,19 @@ public void SynonymsPartialMatchNotHighlighted() // This is expected behavior - same as regular tokens result.Should().Contain("k8s"); } + + [Fact] + public void SynonymsHardReplacements() + { + var text = "ES|QL is esql and not EQL"; + var synonyms = new Dictionary + { + ["esql"] = ["es|ql => esql"] + }; + var result = text.HighlightTokens(["es|ql"], synonyms); + + // Hard replacement "es|ql => esql": searching for the source "es|ql" also highlights the target "esql" + // The direct lookup misses here (the dictionary key is "esql"), so only the source scan over all values applies + result.Should().Contain("ES|QL is esql and not EQL"); + } }