From f075e3b7b35847536f164109fd120b7761dec0b5 Mon Sep 17 00:00:00 2001 From: Felipe Cotti Date: Mon, 17 Nov 2025 17:17:39 -0300 Subject: [PATCH 1/3] Adjust analyzers and query retriever priorities to improve search results between synonyms --- .../Elasticsearch/ElasticsearchExporter.cs | 17 +++++++++++++---- .../Adapters/Search/ElasticsearchGateway.cs | 7 +++++-- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs index 3b8e42a23..eb37958ef 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs @@ -62,7 +62,6 @@ DistributedTransport transport SearchInferenceId = endpoint.NoElasticInferenceService ? null : ".elser-2-elastic" }); - public abstract class ElasticsearchExporter : IDisposable where TChannelOptions : CatalogIndexChannelOptionsBase where TChannel : CatalogIndexChannel @@ -148,6 +147,14 @@ protected static string CreateMappingSetting(string synonymSetName) => { "analysis": { "analyzer": { + "combined_analyzer": { + "tokenizer": "whitespace", + "filter": [ + "lowercase", + "synonyms_filter", + "english_stop" + ] + }, "synonyms_analyzer": { "tokenizer": "whitespace", "filter": [ @@ -243,7 +250,7 @@ protected static string CreateMapping(string? inferenceId) => }, "stripped_body": { "type": "text", - "search_analyzer": "highlight_analyzer", + "search_analyzer": "combined_analyzer", "term_vector": "with_positions_offsets" } {{(!string.IsNullOrWhiteSpace(inferenceId) ? AbstractInferenceMapping(inferenceId) : AbstractMapping())}} @@ -253,7 +260,10 @@ protected static string CreateMapping(string? inferenceId) => private static string AbstractMapping() => """ - , "abstract": { "type": "text" } + , "abstract": { + "type": "text", + "search_analyzer": "combined_analyzer" + } """; private static string InferenceMapping(string inferenceId) => @@ -278,5 +288,4 @@ public void Dispose() GC.SuppressFinalize(this); } - } diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs index 615ac9567..1ad5ddd94 100644 --- a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs +++ b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs @@ -88,10 +88,13 @@ public ElasticsearchGateway(ElasticsearchOptions elasticsearchOptions, ILogger(f => f.Title.Suffix("keyword")), searchQuery) { Boost = 10.0f, CaseInsensitive = true } + || new MatchPhrasePrefixQuery(Infer.Field(f => f.Title), searchQuery) { Boost = 9.0f } || new MatchQuery(Infer.Field(f => f.Title), searchQuery) { Operator = Operator.And, Boost = 8.0f } || new MatchBoolPrefixQuery(Infer.Field(f => f.Title), searchQuery) { Boost = 6.0f } - || new MatchQuery(Infer.Field(f => f.Abstract), searchQuery) { Boost = 4.0f } - || new MatchQuery(Infer.Field(f => f.StrippedBody), searchQuery) { Boost = 3.0f } + || new MatchQuery(Infer.Field(f => f.Abstract), searchQuery) { Operator = Operator.And, Boost = 5.0f } + || new MatchQuery(Infer.Field(f => f.StrippedBody), searchQuery) { Operator = Operator.And, Boost = 4.5f } + || new MatchQuery(Infer.Field(f => f.Abstract), searchQuery) { Operator = Operator.Or, Boost = 4.0f } + || new MatchQuery(Infer.Field(f => f.StrippedBody), searchQuery) { Operator = Operator.Or, Boost = 3.0f } || new MatchQuery(Infer.Field(f => f.Parents.First().Title), searchQuery) { Boost = 2.0f } || new MatchQuery(Infer.Field(f => f.Title), searchQuery) { Fuzziness = 1, Boost = 1.0f } ) From ff28793214556f47cc745aa1c79105c3764b8625 Mon Sep 17 00:00:00 2001 From: Felipe Cotti Date: Mon, 17 Nov 2025 17:49:58 -0300 Subject: [PATCH 2/3] Add document headings to the queryable content --- .../Exporters/Elasticsearch/ElasticsearchExporter.cs | 6 +++++- .../Search/SearchUsecase.cs | 1 + .../Adapters/Search/ElasticsearchGateway.cs | 9 ++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs index eb37958ef..cca083c7b 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs @@ -252,7 +252,11 @@ protected static string CreateMapping(string? inferenceId) => "type": "text", "search_analyzer": "combined_analyzer", "term_vector": "with_positions_offsets" - } + }, + "headings": { + "type": "text", + "search_analyzer": "synonyms_analyzer" + }, {{(!string.IsNullOrWhiteSpace(inferenceId) ? AbstractInferenceMapping(inferenceId) : AbstractMapping())}} } } diff --git a/src/api/Elastic.Documentation.Api.Core/Search/SearchUsecase.cs b/src/api/Elastic.Documentation.Api.Core/Search/SearchUsecase.cs index 49b23cb9f..7d5d4e6de 100644 --- a/src/api/Elastic.Documentation.Api.Core/Search/SearchUsecase.cs +++ b/src/api/Elastic.Documentation.Api.Core/Search/SearchUsecase.cs @@ -72,6 +72,7 @@ public record SearchResultItem public required string Title { get; init; } public required string Description { get; init; } public required SearchResultItemParent[] Parents { get; init; } + public string[]? Headings { get; init; } public float Score { get; init; } public string? HighlightedBody { get; init; } } diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs index 1ad5ddd94..d2bbaaf33 100644 --- a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs +++ b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs @@ -37,6 +37,9 @@ internal sealed record DocumentDto [JsonPropertyName("url_segment_count")] public int UrlSegmentCount { get; init; } + [JsonPropertyName("headings")] + public string[] Headings { get; init; } = []; + [JsonPropertyName("parents")] public ParentDocumentDto[] Parents { get; init; } = []; @@ -93,8 +96,10 @@ public ElasticsearchGateway(ElasticsearchOptions elasticsearchOptions, ILogger(f => f.Title), searchQuery) { Boost = 6.0f } || new MatchQuery(Infer.Field(f => f.Abstract), searchQuery) { Operator = Operator.And, Boost = 5.0f } || new MatchQuery(Infer.Field(f => f.StrippedBody), searchQuery) { Operator = Operator.And, Boost = 4.5f } + || new MatchQuery(Infer.Field(f => f.Headings), searchQuery) { Operator = Operator.And, Boost = 4.5f } || new MatchQuery(Infer.Field(f => f.Abstract), searchQuery) { Operator = Operator.Or, Boost = 4.0f } || new MatchQuery(Infer.Field(f => f.StrippedBody), searchQuery) { Operator = Operator.Or, Boost = 3.0f } + || new MatchQuery(Infer.Field(f => f.Headings), searchQuery) { Operator = Operator.Or, Boost = 3.0f } || new MatchQuery(Infer.Field(f => f.Parents.First().Title), searchQuery) { Boost = 2.0f } || new MatchQuery(Infer.Field(f => f.Title), searchQuery) { Fuzziness = 1, Boost = 1.0f } ) @@ -132,7 +137,8 @@ public ElasticsearchGateway(ElasticsearchOptions elasticsearchOptions, ILogger e.Title, e => e.Url, e => e.Description, - e => e.Parents + e => e.Parents, + e => e.Headings ) ) ) @@ -196,6 +202,7 @@ private static (int TotalHits, List Results) ProcessSearchResp Url = doc.Url, Title = doc.Title, Description = doc.Description ?? string.Empty, + Headings = doc.Headings, Parents = doc.Parents.Select(parent => new SearchResultItemParent { Title = parent.Title, From d1f45f625f049203bbbf441af0c2ec276f491421 Mon Sep 17 00:00:00 2001 From: Felipe Cotti Date: Mon, 17 Nov 2025 18:08:20 -0300 Subject: [PATCH 3/3] Add custom tokenizer and simplify analyzers --- .../Elasticsearch/ElasticsearchExporter.cs | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs index cca083c7b..a0942767f 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs @@ -147,23 +147,15 @@ protected static string CreateMappingSetting(string synonymSetName) => { "analysis": { "analyzer": { - "combined_analyzer": { - "tokenizer": "whitespace", - "filter": [ - "lowercase", - "synonyms_filter", - "english_stop" - ] - }, "synonyms_analyzer": { - "tokenizer": "whitespace", + "tokenizer": "group_tokenizer", "filter": [ "lowercase", "synonyms_filter" ] }, "highlight_analyzer": { - "tokenizer": "standard", + "tokenizer": "group_tokenizer", "filter": [ "lowercase", "english_stop" @@ -183,7 +175,11 @@ protected static string CreateMappingSetting(string synonymSetName) => } }, "tokenizer": { - "path_tokenizer": { + "group_tokenizer": { + "type": "char_group", + "tokenize_on_chars": [ "whitespace", ",", ";", "?", "!", "(", ")", "&", "'", "\"", "/", "[", "]", "{", "}" ] + }, + "path_tokenizer": { "type": "path_hierarchy", "delimiter": "/" } @@ -250,7 +246,7 @@ protected static string CreateMapping(string? inferenceId) => }, "stripped_body": { "type": "text", - "search_analyzer": "combined_analyzer", + "search_analyzer": "synonyms_analyzer", "term_vector": "with_positions_offsets" }, "headings": { @@ -266,7 +262,7 @@ private static string AbstractMapping() => """ , "abstract": { "type": "text", - "search_analyzer": "combined_analyzer" + "search_analyzer": "synonyms_analyzer" } """;