From fb9b9278bbcdf3d8195c1e72c7fabd3a5a6abeda Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Tue, 7 Oct 2025 12:05:36 +0200 Subject: [PATCH] Ensure Elasticsearch documents have an _id and track content hash for partial updates --- .../Search/DocumentationDocument.cs | 12 +++-- .../ElasticsearchMarkdownExporter.cs | 46 ++++++++++++------- 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/src/Elastic.Documentation/Search/DocumentationDocument.cs b/src/Elastic.Documentation/Search/DocumentationDocument.cs index 28c23ee3f..25660a76e 100644 --- a/src/Elastic.Documentation/Search/DocumentationDocument.cs +++ b/src/Elastic.Documentation/Search/DocumentationDocument.cs @@ -4,6 +4,7 @@ using System.Text.Json.Serialization; using Elastic.Documentation.AppliesTo; +using Elastic.Documentation.Extensions; namespace Elastic.Documentation.Search; @@ -18,6 +19,14 @@ public record ParentDocument public record DocumentationDocument { + // TODO make this required once all doc_sets have published again + [JsonPropertyName("url")] + public string Url { get; set; } = string.Empty; + + // TODO make this required once all doc_sets have published again + [JsonPropertyName("hash")] + public string Hash { get; set; } = string.Empty; + [JsonPropertyName("title")] public string? Title { get; set; } @@ -30,9 +39,6 @@ public record DocumentationDocument [JsonPropertyName("links")] public string[] Links { get; set; } = []; - [JsonPropertyName("url")] - public string? Url { get; set; } - [JsonPropertyName("applies_to")] public ApplicableTo? Applies { get; set; } diff --git a/src/Elastic.Markdown/Exporters/ElasticsearchMarkdownExporter.cs b/src/Elastic.Markdown/Exporters/ElasticsearchMarkdownExporter.cs index b3747212d..b253447f2 100644 --- a/src/Elastic.Markdown/Exporters/ElasticsearchMarkdownExporter.cs +++ b/src/Elastic.Markdown/Exporters/ElasticsearchMarkdownExporter.cs @@ -6,6 +6,7 @@ using Elastic.Channels; using Elastic.Documentation.Configuration; using Elastic.Documentation.Diagnostics; +using Elastic.Documentation.Extensions; using Elastic.Documentation.Search; using Elastic.Documentation.Serialization; using Elastic.Ingest.Elasticsearch; @@ -26,8 +27,9 @@ public class ElasticsearchMarkdownExporter(ILoggerFactory logFactory, IDiagnosti /// protected override CatalogIndexChannelOptions NewOptions(DistributedTransport transport) => new(transport) { + BulkOperationIdLookup = d => d.Url, GetMapping = () => CreateMapping(null), - GetMappingSettings = () => CreateMappingSetting(), + GetMappingSettings = CreateMappingSetting, IndexFormat = $"{Endpoint.IndexNamePrefix.ToLowerInvariant()}-{indexNamespace.ToLowerInvariant()}-{{0:yyyy.MM.dd.HHmmss}}", ActiveSearchAlias = $"{Endpoint.IndexNamePrefix}-{indexNamespace.ToLowerInvariant()}", }; @@ -43,13 +45,14 @@ public class ElasticsearchMarkdownSemanticExporter(ILoggerFactory logFactory, ID /// protected override SemanticIndexChannelOptions NewOptions(DistributedTransport transport) => new(transport) { + BulkOperationIdLookup = d => d.Url, GetMapping = (inferenceId, _) => CreateMapping(inferenceId), GetMappingSettings = (_, _) => CreateMappingSetting(), IndexFormat = $"{Endpoint.IndexNamePrefix.ToLowerInvariant()}-{indexNamespace.ToLowerInvariant()}-{{0:yyyy.MM.dd.HHmmss}}", ActiveSearchAlias = $"{Endpoint.IndexNamePrefix}-{indexNamespace.ToLowerInvariant()}", IndexNumThreads = Endpoint.IndexNumThreads, SearchNumThreads = Endpoint.SearchNumThreads, - InferenceCreateTimeout = TimeSpan.FromMinutes(Endpoint.BootstrapTimeout ?? 4) + InferenceCreateTimeout = TimeSpan.FromMinutes(Endpoint.BootstrapTimeout ?? 4), }; /// @@ -86,7 +89,8 @@ protected static string CreateMappingSetting() => "lowercase", "synonyms_filter" ] - } + }, + "hierarchy_analyzer": { "tokenizer": "path_tokenizer" } }, "filter": { "synonyms_filter": { @@ -94,6 +98,12 @@ protected static string CreateMappingSetting() => "synonyms_set": "docs", "updateable": true } + }, + "tokenizer": { + "path_tokenizer": { + "type": "path_hierarchy", + "delimiter": "/" + } } } } @@ -103,22 +113,22 @@ protected static string CreateMapping(string? inferenceId) => $$""" { "properties": { - "title": { - "type": "text", - "search_analyzer": "synonyms_analyzer", + "url" : { + "type": "keyword", "fields": { - "keyword": { - "type": "keyword" - } - {{(!string.IsNullOrWhiteSpace(inferenceId) ? $$""", "semantic_text": {{{InferenceMapping(inferenceId)}}}""" : "")}} + "match": { "type": "text" }, + "prefix": { "type": "text", "analyzer" : "hierarchy_analyzer" } } }, - "url": { + "hash" : { "type" : "keyword" }, + "title": { "type": "text", + "search_analyzer": "synonyms_analyzer", "fields": { "keyword": { "type": "keyword" } + {{(!string.IsNullOrWhiteSpace(inferenceId) ? $$""", "semantic_text": {{{InferenceMapping(inferenceId)}}}""" : "")}} } }, "url_segment_count": { @@ -275,16 +285,18 @@ public async ValueTask ExportAsync(MarkdownExportFileContext fileContext, .Where(text => !string.IsNullOrEmpty(text)) .ToArray(); + var @abstract = !string.IsNullOrEmpty(body) + ? body[..Math.Min(body.Length, 400)] + " " + string.Join(" \n- ", headings) + : string.Empty; + var doc = new DocumentationDocument { - Title = file.Title, Url = url, + Hash = ShortId.Create(url, body), + Title = file.Title, Body = body, Description = fileContext.SourceFile.YamlFrontMatter?.Description, - - Abstract = !string.IsNullOrEmpty(body) - ? body[..Math.Min(body.Length, 400)] + " " + string.Join(" \n- ", headings) - : string.Empty, + Abstract = @abstract, Applies = fileContext.SourceFile.YamlFrontMatter?.AppliesTo, UrlSegmentCount = url.Split('/', StringSplitOptions.RemoveEmptyEntries).Length, Parents = navigation.GetParentsOfMarkdownFile(file).Select(i => new ParentDocument @@ -292,7 +304,7 @@ public async ValueTask ExportAsync(MarkdownExportFileContext fileContext, Title = i.NavigationTitle, Url = i.Url }).Reverse().ToArray(), - Headings = headings + Headings = headings, }; return await TryWrite(doc, ctx); }