diff --git a/Directory.Packages.props b/Directory.Packages.props index 11b537444..4e16e1cf8 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -26,6 +26,7 @@ + @@ -37,7 +38,7 @@ - + @@ -103,4 +104,4 @@ - + \ No newline at end of file diff --git a/aspire/AppHost.cs b/aspire/AppHost.cs index dfa05ddc4..d35dbd809 100644 --- a/aspire/AppHost.cs +++ b/aspire/AppHost.cs @@ -44,8 +44,7 @@ async Task BuildAspireHost(bool startElasticsearch, bool assumeCloned, bool skip .WithArgs(globalArguments) .WithEnvironment("ENVIRONMENT", "dev") .WithEnvironment("LLM_GATEWAY_FUNCTION_URL", llmUrl) - .WithEnvironment("LLM_GATEWAY_SERVICE_ACCOUNT_KEY_PATH", llmServiceAccountPath) - .WithExplicitStart(); + .WithEnvironment("LLM_GATEWAY_SERVICE_ACCOUNT_KEY_PATH", llmServiceAccountPath); // ReSharper disable once RedundantAssignment api = startElasticsearch @@ -55,12 +54,14 @@ async Task BuildAspireHost(bool startElasticsearch, bool assumeCloned, bool skip .WithEnvironment(context => context.EnvironmentVariables["DOCUMENTATION_ELASTIC_PASSWORD"] = elasticsearchLocal.Resource.PasswordParameter) .WithParentRelationship(elasticsearchLocal) .WaitFor(elasticsearchLocal) + .WithExplicitStart() : api.WithReference(elasticsearchRemote) .WithEnvironment("DOCUMENTATION_ELASTIC_URL", elasticsearchUrl) - .WithEnvironment("DOCUMENTATION_ELASTIC_APIKEY", elasticsearchApiKey); + .WithEnvironment("DOCUMENTATION_ELASTIC_APIKEY", elasticsearchApiKey) + .WithExplicitStart(); - var indexElasticsearch = builder.AddProject(ElasticsearchIndexerPlain) - .WithArgs(["assembler", "build", "--exporters", "elasticsearch", .. globalArguments]) + var indexElasticsearch = builder.AddProject(ElasticsearchIngest) + .WithArgs(["assembler", "index", .. 
globalArguments]) .WithExplicitStart() .WaitForCompletion(cloneAll); @@ -78,27 +79,6 @@ async Task BuildAspireHost(bool startElasticsearch, bool assumeCloned, bool skip .WithEnvironment("DOCUMENTATION_ELASTIC_APIKEY", elasticsearchApiKey) .WithParentRelationship(elasticsearchRemote); - var indexElasticsearchSemantic = builder.AddProject(ElasticsearchIndexerSemantic) - .WithArgs(["assembler", "build", "--exporters", "semantic", .. globalArguments]) - .WithEnvironment("DOCUMENTATION_ELASTIC_URL", elasticsearchLocal.GetEndpoint("http")) - .WithEnvironment(context => context.EnvironmentVariables["DOCUMENTATION_ELASTIC_PASSWORD"] = elasticsearchLocal.Resource.PasswordParameter) - .WithExplicitStart() - .WaitForCompletion(cloneAll); - - // ReSharper disable once RedundantAssignment - indexElasticsearchSemantic = startElasticsearch - ? indexElasticsearchSemantic - .WaitFor(elasticsearchLocal) - .WithReference(elasticsearchLocal) - .WithEnvironment("DOCUMENTATION_ELASTIC_URL", elasticsearchLocal.GetEndpoint("http")) - .WithEnvironment(context => context.EnvironmentVariables["DOCUMENTATION_ELASTIC_PASSWORD"] = elasticsearchLocal.Resource.PasswordParameter) - .WithParentRelationship(elasticsearchLocal) - : indexElasticsearchSemantic - .WithReference(elasticsearchRemote) - .WithEnvironment("DOCUMENTATION_ELASTIC_URL", elasticsearchUrl) - .WithEnvironment("DOCUMENTATION_ELASTIC_APIKEY", elasticsearchApiKey) - .WithParentRelationship(elasticsearchRemote); - var serveStatic = builder.AddProject(AssemblerServe) .WithEnvironment("LLM_GATEWAY_FUNCTION_URL", llmUrl) .WithEnvironment("LLM_GATEWAY_SERVICE_ACCOUNT_KEY_PATH", llmServiceAccountPath) diff --git a/aspire/ResourceNames.cs b/aspire/ResourceNames.cs index 942899b80..20c451a02 100644 --- a/aspire/ResourceNames.cs +++ b/aspire/ResourceNames.cs @@ -12,6 +12,5 @@ public static class ResourceNames public const string ElasticsearchLocal = "elasticsearch-local"; public const string ElasticsearchRemote = "elasticsearch-remote"; 
public const string LambdaApi = "lambda-api"; - public const string ElasticsearchIndexerPlain = "elasticsearch-indexer-plain"; - public const string ElasticsearchIndexerSemantic = "elasticsearch-indexer-semantic"; + public const string ElasticsearchIngest = "elasticsearch-ingest"; } diff --git a/src/Elastic.ApiExplorer/Elastic.ApiExplorer.csproj b/src/Elastic.ApiExplorer/Elastic.ApiExplorer.csproj index 3cf18fdd4..36b908d35 100644 --- a/src/Elastic.ApiExplorer/Elastic.ApiExplorer.csproj +++ b/src/Elastic.ApiExplorer/Elastic.ApiExplorer.csproj @@ -23,4 +23,8 @@ + + + + diff --git a/src/Elastic.ApiExplorer/Elasticsearch/OpenApiDocumentExporter.cs b/src/Elastic.ApiExplorer/Elasticsearch/OpenApiDocumentExporter.cs new file mode 100644 index 000000000..cf7d19b01 --- /dev/null +++ b/src/Elastic.ApiExplorer/Elasticsearch/OpenApiDocumentExporter.cs @@ -0,0 +1,365 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + +using System.Globalization; +using System.Runtime.CompilerServices; +using System.Text; +using System.Text.RegularExpressions; +using Elastic.Documentation; +using Elastic.Documentation.AppliesTo; +using Elastic.Documentation.Configuration.Versions; +using Elastic.Documentation.Search; +using Microsoft.OpenApi; +using Microsoft.OpenApi.Reader; + +namespace Elastic.ApiExplorer.Elasticsearch; + +/// +/// Exports OpenAPI specifications from CloudFront URLs and converts them to DocumentationDocument instances. 
+/// +public partial class OpenApiDocumentExporter(VersionsConfiguration versionsConfiguration) +{ + private static readonly HttpClient HttpClient = new(); + + private const string ElasticsearchOpenApiUrl = "https://d31bhlox0wglh.cloudfront.net/elasticsearch-openapi-docs.json"; + private const string KibanaOpenApiUrl = "https://d31bhlox0wglh.cloudfront.net/kibana-openapi.json"; + + [GeneratedRegex(@"Added in (\d+\.\d+\.\d+)", RegexOptions.IgnoreCase)] + private static partial Regex AddedInVersionRegex(); + + [GeneratedRegex(@"(\w+)\s*([^<]+)", RegexOptions.IgnoreCase)] + private static partial Regex OperationVerbPathRegex(); + + /// + /// Fetches and processes both Elasticsearch and Kibana OpenAPI specifications. + /// + /// Optional limit of documents to return per source (Elasticsearch and Kibana) + /// Cancellation token + /// Enumerable of DocumentationDocument instances for all endpoints + public async IAsyncEnumerable ExportDocuments(int? limitPerSource = null, [EnumeratorCancellation] Cancel ctx = default) + { + // Process Elasticsearch API + var elasticsearchCount = 0; + await foreach (var doc in ExportFromUrl(ElasticsearchOpenApiUrl, "elasticsearch", ctx)) + { + yield return doc; + elasticsearchCount++; + if (limitPerSource.HasValue && elasticsearchCount >= limitPerSource.Value) + break; + } + + // Process Kibana API + var kibanaCount = 0; + await foreach (var doc in ExportFromUrl(KibanaOpenApiUrl, "kibana", ctx)) + { + yield return doc; + kibanaCount++; + if (limitPerSource.HasValue && kibanaCount >= limitPerSource.Value) + break; + } + } + + /// + /// Fetches OpenAPI spec from a URL and converts it to DocumentationDocument instances. 
+    ///
+    private async IAsyncEnumerable<DocumentationDocument> ExportFromUrl(
+        string url,
+        string product,
+        [EnumeratorCancellation] Cancel ctx)
+    {
+        var openApiDocument = await FetchOpenApiDocument(url, ctx);
+        if (openApiDocument == null) // fetch/parse failed and was already logged; emit nothing for this source
+            yield break;
+
+        foreach (var doc in ConvertToDocuments(openApiDocument, product))
+            yield return doc;
+    }
+
+    /// <summary>
+    /// Fetches and parses an OpenAPI document from a URL; on failure logs to stderr and returns null, unless <paramref name="ctx"/> was cancelled, in which case the exception propagates.
+    /// </summary>
+    private static async Task<OpenApiDocument?> FetchOpenApiDocument(string url, Cancel ctx)
+    {
+        try
+        {
+            using var response = await HttpClient.GetAsync(url, ctx); // dispose the response once the document has been read
+            _ = response.EnsureSuccessStatusCode();
+
+            await using var stream = await response.Content.ReadAsStreamAsync(ctx);
+            var settings = new OpenApiReaderSettings { LeaveStreamOpen = false };
+            var openApiDocument = await OpenApiDocument.LoadAsync(stream, settings: settings, cancellationToken: ctx);
+
+            return openApiDocument.Document;
+        }
+        catch (Exception ex) when (!ctx.IsCancellationRequested) // best effort for fetch/parse errors only; a requested cancellation must not be reported as a null document
+        {
+            Console.Error.WriteLine($"Failed to fetch OpenAPI document from {url}: {ex.Message}");
+            return null;
+        }
+    }
+
+    /// <summary>
+    /// Converts an OpenAPI document to DocumentationDocument instances.
+    /// </summary>
+    private IEnumerable<DocumentationDocument> ConvertToDocuments(OpenApiDocument openApiDocument, string product)
+    {
+        foreach (var path in openApiDocument.Paths)
+        {
+            if (path.Value.Operations == null)
+                continue;
+
+            foreach (var operation in path.Value.Operations)
+            {
+                var operationId = operation.Value.OperationId ?? GenerateOperationId(operation.Key, path.Key);
+
+                // Check x-state extension for version filtering
+                if (!ShouldIncludeOperation(operation.Value, product))
+                    continue;
+
+                var url = $"/docs/api/doc/{product}/operation/operation-{operationId.ToLowerInvariant()}";
+
+                var productName = CultureInfo.InvariantCulture.TextInfo.ToTitleCase(product);
+                // inject product name into title to ensure differentiation and better scoring
+                var title = $"{operation.Value.Summary ?? 
operationId} - {productName} API "; + var description = TransformOperationListToMarkdown(operation.Value.Description); + + // Build body content from operation details + var bodyBuilder = new StringBuilder(); + _ = bodyBuilder.AppendLine($"# {title}"); + _ = bodyBuilder.AppendLine(); + + if (!string.IsNullOrEmpty(description)) + { + _ = bodyBuilder.AppendLine(description); + _ = bodyBuilder.AppendLine(); + } + + _ = bodyBuilder.AppendLine($"**Method:** {operation.Key.ToString().ToUpperInvariant()}"); + _ = bodyBuilder.AppendLine($"**Path:** {path.Key}"); + _ = bodyBuilder.AppendLine(); + + // Add parameters if any + if (operation.Value.Parameters?.Count > 0) + { + _ = bodyBuilder.AppendLine("## Parameters"); + foreach (var param in operation.Value.Parameters) + _ = bodyBuilder.AppendLine($"- **{param.Name}** ({param.In}): {param.Description}"); + _ = bodyBuilder.AppendLine(); + } + + var body = bodyBuilder.ToString(); + + // Extract tags as headings + var headings = operation.Value.Tags? + .Select(t => t.Name) + .Where(n => !string.IsNullOrEmpty(n)) + .OfType() + .ToArray() ?? []; + + // Extract ApplicableTo from x-state + var applies = ExtractApplicableTo(operation.Value); + + yield return new DocumentationDocument + { + Type = "api", + Url = url, + Title = title, + Description = description, + Body = body, + StrippedBody = body, + Headings = headings, + Links = [], + Applies = applies, + Parents = + [ + new ParentDocument { Title = "API Reference", Url = "/docs/api" }, + new ParentDocument { Title = product, Url = $"/docs/api/doc/{product}" } + ] + }; + } + } + } + + /// + /// Determines if an operation should be included based on its x-state extension. 
+ /// + private bool ShouldIncludeOperation(OpenApiOperation operation, string product) + { + // Try to get x-state extension + if (operation.Extensions == null || !operation.Extensions.TryGetValue("x-state", out var stateExtension)) + return true; // No x-state, safe to include + + // Get the state string value from JsonNodeExtension + if (stateExtension is not JsonNodeExtension jsonNodeExtension) + return true; // Not a JSON node, safe to include + + var stateValue = jsonNodeExtension.Node.GetValue(); + if (string.IsNullOrEmpty(stateValue)) + return true; // Empty state, safe to include + + // Parse version from "Added in X.Y.Z" + var match = AddedInVersionRegex().Match(stateValue); + if (!match.Success) + return true; // No version found, safe to include + + var versionString = match.Groups[1].Value; + if (!SemVersion.TryParse(versionString, out var addedInVersion)) + return true; // Could not parse version, safe to include + + // Get current version for the product + var versioningSystemId = product == "elasticsearch" + ? VersioningSystemId.Stack + : VersioningSystemId.Stack; // Both use Stack for now + + var versioningSystem = versionsConfiguration.GetVersioningSystem(versioningSystemId); + var currentVersion = versioningSystem.Current; + + // Include if added version is <= current version + return addedInVersion <= currentVersion; + } + + /// + /// Generates an operation ID from method and path when one is not provided. + /// + private static string GenerateOperationId(HttpMethod method, string path) + { + var cleanPath = path.TrimStart('/').Replace('/', '-').Replace('{', '-').Replace('}', '-'); + return $"{method.ToString().ToLowerInvariant()}-{cleanPath}"; + } + + /// + /// Extracts ApplicableTo information from an operation's x-state extension. + /// + private static ApplicableTo? 
ExtractApplicableTo(OpenApiOperation operation) + { + // Try to get x-state extension + if (operation.Extensions == null || !operation.Extensions.TryGetValue("x-state", out var stateExtension)) + return null; + + // Get the state string value from JsonNodeExtension + if (stateExtension is not JsonNodeExtension jsonNodeExtension) + return null; + + var stateValue = jsonNodeExtension.Node.GetValue(); + if (string.IsNullOrEmpty(stateValue)) + return null; + + // Parse lifecycle from state string (e.g., "Generally available; Added in 9.3.0") + var lifecycle = ParseLifecycle(stateValue); + + // Parse version from "Added in X.Y.Z" + var version = ParseVersion(stateValue); + + // Create Applicability instance + var applicability = new Applicability + { + Lifecycle = lifecycle, + Version = version + }; + + // Create AppliesCollection + var appliesCollection = new AppliesCollection([applicability]); + + // Return ApplicableTo with Stack set + return new ApplicableTo + { + Stack = appliesCollection + }; + } + + /// + /// Parses the product lifecycle from the x-state string. + /// + private static ProductLifecycle ParseLifecycle(string stateValue) + { + var lower = stateValue.ToLowerInvariant(); + + if (lower.Contains("generally available")) + return ProductLifecycle.GenerallyAvailable; + if (lower.Contains("beta")) + return ProductLifecycle.Beta; + if (lower.Contains("tech") && lower.Contains("preview")) + return ProductLifecycle.TechnicalPreview; + if (lower.Contains("deprecated")) + return ProductLifecycle.Deprecated; + if (lower.Contains("removed")) + return ProductLifecycle.Removed; + + // Default to GA if we can't parse + return ProductLifecycle.GenerallyAvailable; + } + + /// + /// Parses the version from "Added in X.Y.Z" pattern in the x-state string. + /// + private static SemVersion? 
ParseVersion(string stateValue) + { + var match = AddedInVersionRegex().Match(stateValue); + if (!match.Success) + return null; + + var versionString = match.Groups[1].Value; + return SemVersion.TryParse(versionString, out var version) ? version : null; + } + + /// + /// Transforms HTML operation lists in descriptions to markdown format. + /// Detects "**All methods and paths for this operation:**" followed by HTML divs/spans + /// and converts them to a markdown list appended at the end. + /// + private static string TransformOperationListToMarkdown(string? description) + { + if (string.IsNullOrEmpty(description)) + return description ?? string.Empty; + + // Check if description starts with the operations list header + if (!description.Contains("**All methods and paths for this operation:**")) + return description; + + // Extract all operation verb and path pairs + var matches = OperationVerbPathRegex().Matches(description); + if (matches.Count == 0) + return description; + + // Find where the HTML content starts and ends + var htmlStartIndex = description.IndexOf("
", StringComparison.Ordinal); + var lastMatchEnd = matches[^1].Index + matches[^1].Length; + + // Find the last closing div after the last match + var htmlEndIndex = description.IndexOf("
", lastMatchEnd, StringComparison.Ordinal); + if (htmlEndIndex == -1 || htmlStartIndex == -1) + return description; + + // Build the clean description without HTML + var beforeHtml = description[..htmlStartIndex].Trim(); + var afterHtml = description[(htmlEndIndex + 6)..].Trim(); + + // Build markdown list + var markdownList = new StringBuilder(); + _ = markdownList.AppendLine(); + _ = markdownList.AppendLine(); + + foreach (Match match in matches) + { + var verb = match.Groups[2].Value.ToUpperInvariant(); + var path = match.Groups[3].Value; + _ = markdownList.AppendLine($"- **{verb}** `{path}`"); + } + + // Combine: clean description (before + after HTML) + markdown list at the end + var result = new StringBuilder(); + _ = result.Append(beforeHtml); + if (!string.IsNullOrWhiteSpace(afterHtml)) + { + _ = result.AppendLine(); + _ = result.AppendLine(); + _ = result.Append(afterHtml); + } + + // Append markdown list at the end + _ = result.Append(markdownList); + + return result.ToString().Trim(); + } +} diff --git a/src/Elastic.Documentation.Configuration/IDocumentationConfigurationContext.cs b/src/Elastic.Documentation.Configuration/IDocumentationConfigurationContext.cs index 694b27771..12b45994c 100644 --- a/src/Elastic.Documentation.Configuration/IDocumentationConfigurationContext.cs +++ b/src/Elastic.Documentation.Configuration/IDocumentationConfigurationContext.cs @@ -17,7 +17,6 @@ public interface IConfigurationContext ProductsConfiguration ProductsConfiguration { get; } LegacyUrlMappingConfiguration LegacyUrlMappings { get; } SynonymsConfiguration SynonymsConfiguration { get; } - } /// Used only to seed in DI, you primarily want to depend on @@ -42,4 +41,7 @@ public class ConfigurationContext : IConfigurationContext public required SynonymsConfiguration SynonymsConfiguration { get; init; } } -public interface IDocumentationConfigurationContext : IDocumentationContext, IConfigurationContext; +public interface IDocumentationConfigurationContext : 
IDocumentationContext, IConfigurationContext +{ + Uri? CanonicalBaseUrl { get; } +} diff --git a/src/Elastic.Documentation.Site/Assets/web-components/SearchOrAskAi/Search/useSearchQuery.ts b/src/Elastic.Documentation.Site/Assets/web-components/SearchOrAskAi/Search/useSearchQuery.ts index 4eca72490..ad302a11c 100644 --- a/src/Elastic.Documentation.Site/Assets/web-components/SearchOrAskAi/Search/useSearchQuery.ts +++ b/src/Elastic.Documentation.Site/Assets/web-components/SearchOrAskAi/Search/useSearchQuery.ts @@ -21,6 +21,7 @@ const SearchResultItemParent = z.object({ }) const SearchResultItem = z.object({ + type: z.string().default('doc'), url: z.string(), title: z.string(), description: z.string(), diff --git a/src/Elastic.Documentation/Search/DocumentationDocument.cs b/src/Elastic.Documentation/Search/DocumentationDocument.cs index 8470fc3b5..78207d7a5 100644 --- a/src/Elastic.Documentation/Search/DocumentationDocument.cs +++ b/src/Elastic.Documentation/Search/DocumentationDocument.cs @@ -18,6 +18,9 @@ public record ParentDocument public record DocumentationDocument { + [JsonPropertyName("type")] + public string Type { get; set; } = "doc"; + // TODO make this required once all doc_sets have published again [JsonPropertyName("url")] public string Url { get; set; } = string.Empty; @@ -57,9 +60,6 @@ public record DocumentationDocument [JsonPropertyName("stripped_body")] public string? StrippedBody { get; set; } - [JsonPropertyName("url_segment_count")] - public int? UrlSegmentCount { get; set; } - [JsonPropertyName("abstract")] public string? 
Abstract { get; set; } diff --git a/src/Elastic.Markdown/Elastic.Markdown.csproj b/src/Elastic.Markdown/Elastic.Markdown.csproj index 382b8c652..058ffc171 100644 --- a/src/Elastic.Markdown/Elastic.Markdown.csproj +++ b/src/Elastic.Markdown/Elastic.Markdown.csproj @@ -32,6 +32,7 @@ + diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs index bf0716fbb..b793fc89c 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs @@ -5,18 +5,19 @@ using System.IO.Abstractions; using System.Text.Json; using System.Text.Json.Serialization; +using Elastic.ApiExplorer.Elasticsearch; using Elastic.Documentation.AppliesTo; using Elastic.Documentation.Configuration; -using Elastic.Documentation.Configuration.Synonyms; +using Elastic.Documentation.Configuration.Versions; using Elastic.Documentation.Diagnostics; using Elastic.Documentation.Navigation; using Elastic.Documentation.Search; using Elastic.Ingest.Elasticsearch; using Elastic.Ingest.Elasticsearch.Indices; using Elastic.Markdown.Helpers; -using Elastic.Markdown.IO; using Elastic.Transport; using Elastic.Transport.Products.Elasticsearch; +using Markdig.Parsers; using Markdig.Syntax; using Microsoft.Extensions.Logging; using NetEscapades.EnumGenerators; @@ -29,6 +30,7 @@ public enum IngestStrategy { Reindex, Multiplex } public class ElasticsearchMarkdownExporter : IMarkdownExporter, IDisposable { private readonly IDiagnosticsCollector _collector; + private readonly IDocumentationConfigurationContext _context; private readonly ILogger _logger; private readonly ElasticsearchLexicalExporter _lexicalChannel; private readonly ElasticsearchSemanticExporter _semanticChannel; @@ -43,20 +45,24 @@ public class ElasticsearchMarkdownExporter : IMarkdownExporter, IDisposable private string _currentSemanticHash 
= string.Empty; private readonly IReadOnlyCollection _synonyms; + private readonly VersionsConfiguration _versionsConfiguration; public ElasticsearchMarkdownExporter( ILoggerFactory logFactory, IDiagnosticsCollector collector, DocumentationEndpoints endpoints, string indexNamespace, - SynonymsConfiguration synonyms + IDocumentationConfigurationContext context ) { _collector = collector; + _context = context; _logger = logFactory.CreateLogger(); _endpoint = endpoints.Elasticsearch; _indexStrategy = IngestStrategy.Reindex; _indexNamespace = indexNamespace; + _versionsConfiguration = context.VersionsConfiguration; + _synonyms = context.SynonymsConfiguration.Synonyms; var es = endpoints.Elasticsearch; var configuration = new ElasticsearchConfiguration(es.Uri) @@ -83,7 +89,6 @@ SynonymsConfiguration synonyms }; _transport = new DistributedTransport(configuration); - _synonyms = synonyms.Synonyms; _lexicalChannel = new ElasticsearchLexicalExporter(logFactory, collector, es, indexNamespace, _transport); _semanticChannel = new ElasticsearchSemanticExporter(logFactory, collector, es, indexNamespace, _transport); } @@ -380,6 +385,21 @@ private async ValueTask DoReindex(PostData request, string lexicalWriteAlias, st } while (!completed); } + /// + /// Assigns hash, last updated, and batch index date to a documentation document. + /// + private void AssignDocumentMetadata(DocumentationDocument doc) + { + var semanticHash = _semanticChannel.Channel.ChannelHash; + var lexicalHash = _lexicalChannel.Channel.ChannelHash; + var hash = HashedBulkUpdate.CreateHash(semanticHash, lexicalHash, + doc.Url, doc.Type, doc.Body ?? 
string.Empty, string.Join(",", doc.Headings.OrderBy(h => h)), doc.Url + ); + doc.Hash = hash; + doc.LastUpdated = _batchIndexDate; + doc.BatchIndexDate = _batchIndexDate; + } + public async ValueTask ExportAsync(MarkdownExportFileContext fileContext, Cancel ctx) { var file = fileContext.SourceFile; @@ -408,8 +428,9 @@ public async ValueTask ExportAsync(MarkdownExportFileContext fileContext, .Where(text => !string.IsNullOrEmpty(text)) .ToArray(); - var @abstract = !string.IsNullOrEmpty(body) - ? body[..Math.Min(body.Length, 400)] + " " + string.Join(" \n- ", headings) + var strippedBody = body.StripMarkdown(); + var @abstract = !string.IsNullOrEmpty(strippedBody) + ? strippedBody[..Math.Min(strippedBody.Length, 400)] + " " + string.Join(" \n- ", headings) : string.Empty; // this is temporary until https://github.com/elastic/docs-builder/pull/2070 lands @@ -421,11 +442,10 @@ public async ValueTask ExportAsync(MarkdownExportFileContext fileContext, Url = url, Title = file.Title, Body = body, - StrippedBody = body.StripMarkdown(), + StrippedBody = strippedBody, Description = fileContext.SourceFile.YamlFrontMatter?.Description, Abstract = @abstract, Applies = appliesTo, - UrlSegmentCount = url.Split('/', StringSplitOptions.RemoveEmptyEntries).Length, Parents = navigation.GetParentsOfMarkdownFile(file).Select(i => new ParentDocument { Title = i.NavigationTitle, @@ -434,14 +454,7 @@ public async ValueTask ExportAsync(MarkdownExportFileContext fileContext, Headings = headings }; - var semanticHash = _semanticChannel.Channel.ChannelHash; - var lexicalHash = _lexicalChannel.Channel.ChannelHash; - var hash = HashedBulkUpdate.CreateHash(semanticHash, lexicalHash, - doc.Url, doc.Body ?? 
string.Empty, string.Join(",", doc.Headings.OrderBy(h => h)), doc.Url - ); - doc.Hash = hash; - doc.LastUpdated = _batchIndexDate; - doc.BatchIndexDate = _batchIndexDate; + AssignDocumentMetadata(doc); if (_indexStrategy == IngestStrategy.Multiplex) return await _lexicalChannel.TryWrite(doc, ctx) && await _semanticChannel.TryWrite(doc, ctx); @@ -449,7 +462,56 @@ public async ValueTask ExportAsync(MarkdownExportFileContext fileContext, } /// - public ValueTask FinishExportAsync(IDirectoryInfo outputFolder, Cancel ctx) => ValueTask.FromResult(true); + public async ValueTask FinishExportAsync(IDirectoryInfo outputFolder, Cancel ctx) + { + + // this is temporary; once we implement Elastic.ApiExplorer, this should flow through + // we'll rename IMarkdownExporter to IDocumentationFileExporter at that point + _logger.LogInformation("Exporting OpenAPI documentation to Elasticsearch"); + + var exporter = new OpenApiDocumentExporter(_versionsConfiguration); + + await foreach (var doc in exporter.ExportDocuments(limitPerSource: null, ctx)) + { + AssignDocumentMetadata(doc); + var document = MarkdownParser.Parse(doc.Body ?? string.Empty); + + doc.Body = LlmMarkdownExporter.ConvertToLlmMarkdown(document, _context); + + var headings = document.Descendants() + .Select(h => h.GetData("header") as string ?? string.Empty) // TODO: Confirm that 'header' data is correctly set for all HeadingBlock instances and that this extraction is reliable. + .Where(text => !string.IsNullOrEmpty(text)) + .ToArray(); + + doc.StrippedBody = doc.Body.StripMarkdown(); + var @abstract = !string.IsNullOrEmpty(doc.StrippedBody) + ? 
doc.StrippedBody[..Math.Min(doc.StrippedBody.Length, 400)] + " " + string.Join(" \n- ", doc.Headings) // slice the stripped text the bound was computed from, as ExportAsync does
+                : string.Empty;
+            doc.Abstract = @abstract;
+            doc.Headings = headings; // NOTE(review): the abstract above was joined from the Headings set by the OpenAPI exporter, before this overwrite - confirm that is intended
+
+            // Write to channels following the multiplex or reindex strategy
+            if (_indexStrategy == IngestStrategy.Multiplex)
+            {
+                if (!await _lexicalChannel.TryWrite(doc, ctx) || !await _semanticChannel.TryWrite(doc, ctx))
+                {
+                    _logger.LogError("Failed to write OpenAPI document {Url}", doc.Url);
+                    return false;
+                }
+            }
+            else
+            {
+                if (!await _lexicalChannel.TryWrite(doc, ctx))
+                {
+                    _logger.LogError("Failed to write OpenAPI document {Url}", doc.Url);
+                    return false;
+                }
+            }
+        }
+
+        _logger.LogInformation("Finished exporting OpenAPI documentation");
+        return true;
+    }
 
     ///
     public void Dispose()
diff --git a/src/Elastic.Markdown/Exporters/ExporterExtensions.cs b/src/Elastic.Markdown/Exporters/ExporterExtensions.cs
index cbaf0ddae..ad7577bd5 100644
--- a/src/Elastic.Markdown/Exporters/ExporterExtensions.cs
+++ b/src/Elastic.Markdown/Exporters/ExporterExtensions.cs
@@ -24,7 +24,7 @@ string indexNamespace
         if (exportOptions.Contains(Exporter.Configuration))
             markdownExporters.Add(new ConfigurationExporter(logFactory, context.ConfigurationFileProvider, context));
         if (exportOptions.Contains(Exporter.Elasticsearch))
-            markdownExporters.Add(new ElasticsearchMarkdownExporter(logFactory, context.Collector, context.Endpoints, indexNamespace, context.SynonymsConfiguration));
+            markdownExporters.Add(new ElasticsearchMarkdownExporter(logFactory, context.Collector, context.Endpoints, indexNamespace, context));
         return markdownExporters;
     }
 }
diff --git a/src/Elastic.Markdown/Exporters/LlmMarkdownExporter.cs b/src/Elastic.Markdown/Exporters/LlmMarkdownExporter.cs
index c337f285b..2ac6eab21 100644
--- a/src/Elastic.Markdown/Exporters/LlmMarkdownExporter.cs
+++ b/src/Elastic.Markdown/Exporters/LlmMarkdownExporter.cs
@@ -87,7 +87,7 @@ await fileContext.SourceFile.SourceFile.FileSystem.File.WriteAllTextAsync(
         return true;
     }
 
-    
public static string ConvertToLlmMarkdown(MarkdownDocument document, BuildContext context) => + public static string ConvertToLlmMarkdown(MarkdownDocument document, IDocumentationConfigurationContext context) => DocumentationObjectPoolProvider.UseLlmMarkdownRenderer(context, document, static (renderer, obj) => { _ = renderer.Render(obj); diff --git a/src/Elastic.Markdown/Helpers/DocumentationObjectPoolProvider.cs b/src/Elastic.Markdown/Helpers/DocumentationObjectPoolProvider.cs index ab5ae3185..7ed492479 100644 --- a/src/Elastic.Markdown/Helpers/DocumentationObjectPoolProvider.cs +++ b/src/Elastic.Markdown/Helpers/DocumentationObjectPoolProvider.cs @@ -21,7 +21,7 @@ internal static class DocumentationObjectPoolProvider public static readonly ObjectPool HtmlRendererPool = PoolProvider.Create(new HtmlRendererPooledObjectPolicy()); private static readonly ObjectPool LlmMarkdownRendererPool = PoolProvider.Create(new LlmMarkdownRendererPooledObjectPolicy()); - public static string UseLlmMarkdownRenderer(BuildContext buildContext, TContext context, Action action) + public static string UseLlmMarkdownRenderer(IDocumentationConfigurationContext buildContext, TContext context, Action action) { var subscription = LlmMarkdownRendererPool.Get(); subscription.SetBuildContext(buildContext); @@ -93,7 +93,7 @@ private sealed class LlmMarkdownRenderSubscription public required LlmMarkdownRenderer LlmMarkdownRenderer { get; init; } public StringBuilder? 
RentedStringBuilder { get; internal set; } - public void SetBuildContext(BuildContext buildContext) => LlmMarkdownRenderer.BuildContext = buildContext; + public void SetBuildContext(IDocumentationConfigurationContext buildContext) => LlmMarkdownRenderer.BuildContext = buildContext; } private sealed class LlmMarkdownRendererPooledObjectPolicy : IPooledObjectPolicy diff --git a/src/Elastic.Markdown/Myst/Directives/Version/VersionBlock.cs b/src/Elastic.Markdown/Myst/Directives/Version/VersionBlock.cs index f27679f77..87bab6f03 100644 --- a/src/Elastic.Markdown/Myst/Directives/Version/VersionBlock.cs +++ b/src/Elastic.Markdown/Myst/Directives/Version/VersionBlock.cs @@ -2,6 +2,7 @@ // Elasticsearch B.V licenses this file to you under the Apache 2.0 License. // See the LICENSE file in the project root for more information +using System.Globalization; using Elastic.Documentation; using Elastic.Markdown.Diagnostics; using static System.StringSplitOptions; @@ -34,7 +35,7 @@ public override void FinalizeAndValidate(ParserContext context) } Version = version; - var title = Thread.CurrentThread.CurrentCulture.TextInfo.ToTitleCase(directive.Replace("version", "version ")); + var title = CultureInfo.InvariantCulture.TextInfo.ToTitleCase(directive.Replace("version", "version ")); title += $" ({Version})"; if (tokens.Length > 1 && !string.IsNullOrWhiteSpace(tokens[1])) title += $": {tokens[1]}"; diff --git a/src/Elastic.Markdown/Myst/Renderers/LlmMarkdown/LlmMarkdownRenderer.cs b/src/Elastic.Markdown/Myst/Renderers/LlmMarkdown/LlmMarkdownRenderer.cs index c3d5bfa30..53e12d0f5 100644 --- a/src/Elastic.Markdown/Myst/Renderers/LlmMarkdown/LlmMarkdownRenderer.cs +++ b/src/Elastic.Markdown/Myst/Renderers/LlmMarkdown/LlmMarkdownRenderer.cs @@ -13,7 +13,7 @@ namespace Elastic.Markdown.Myst.Renderers.LlmMarkdown; /// public class LlmMarkdownRenderer : TextRendererBase { - public required BuildContext BuildContext { get; set; } + public required IDocumentationConfigurationContext 
BuildContext { get; set; } private bool _isAtLineStart = true; /// diff --git a/src/api/Elastic.Documentation.Api.Core/Search/SearchUsecase.cs b/src/api/Elastic.Documentation.Api.Core/Search/SearchUsecase.cs index 7d5d4e6de..20d6ed6f9 100644 --- a/src/api/Elastic.Documentation.Api.Core/Search/SearchUsecase.cs +++ b/src/api/Elastic.Documentation.Api.Core/Search/SearchUsecase.cs @@ -68,6 +68,7 @@ public record SearchResultItemParent public record SearchResultItem { + public required string Type { get; init; } public required string Url { get; init; } public required string Title { get; init; } public required string Description { get; init; } diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs index d2bbaaf33..670948149 100644 --- a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs +++ b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs @@ -25,6 +25,9 @@ internal sealed record DocumentDto [JsonPropertyName("description")] public string? Description { get; init; } + [JsonPropertyName("type")] + public string Type { get; init; } = "doc"; + [JsonPropertyName("body")] public string? Body { get; init; } @@ -134,6 +137,7 @@ public ElasticsearchGateway(ElasticsearchOptions elasticsearchOptions, ILogger sf .Filter(f => f .Includes( + e => e.Type, e => e.Title, e => e.Url, e => e.Description, @@ -201,6 +205,7 @@ private static (int TotalHits, List Results) ProcessSearchResp { Url = doc.Url, Title = doc.Title, + Type = doc.Type, Description = doc.Description ?? 
string.Empty, Headings = doc.Headings, Parents = doc.Parents.Select(parent => new SearchResultItemParent diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/MockSearchGateway.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/MockSearchGateway.cs index 2e5705986..ee425bd45 100644 --- a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/MockSearchGateway.cs +++ b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/MockSearchGateway.cs @@ -12,6 +12,7 @@ public class MockSearchGateway : ISearchGateway [ new SearchResultItem { + Type = "doc", Url = "https://www.elastic.co/kibana", Title = "Kibana: Explore, Visualize, Discover Data", Description = @@ -20,6 +21,7 @@ public class MockSearchGateway : ISearchGateway }, new SearchResultItem { + Type = "doc", Url = "https://www.elastic.co/docs/explore-analyze", Title = "Explore and analyze | Elastic Docs", Description = "Kibana provides a comprehensive suite of tools to help you search, interact with, explore, and analyze your data effectively.", @@ -27,6 +29,7 @@ public class MockSearchGateway : ISearchGateway }, new SearchResultItem { + Type = "doc", Url = "https://www.elastic.co/docs/deploy-manage/deploy/self-managed/install-kibana", Title = "Install Kibana | Elastic Docs", Description = @@ -35,6 +38,7 @@ public class MockSearchGateway : ISearchGateway }, new SearchResultItem { + Type = "doc", Url = "https://www.elastic.co/kibana/kibana-lens", Title = "Kibana Lens – Data visualization. 
Simply.", Description = @@ -43,6 +47,7 @@ public class MockSearchGateway : ISearchGateway }, new SearchResultItem { + Type = "doc", Url = "https://www.elastic.co/docs", Title = "Elastic Docs – Elastic products, guides & reference", Description = @@ -51,6 +56,7 @@ public class MockSearchGateway : ISearchGateway }, new SearchResultItem { + Type = "doc", Url = "https://www.elastic.co/docs/get-started/introduction", Title = "Get started | Elastic Docs", Description = @@ -59,6 +65,7 @@ public class MockSearchGateway : ISearchGateway }, new SearchResultItem { + Type = "doc", Url = "https://www.elastic.co/docs/solutions/search/elasticsearch-basics-quickstart", Title = "Elasticsearch basics quickstart", Description = "Hands‑on introduction to fundamental Elasticsearch concepts: indices, documents, mappings, and search via Console syntax.", @@ -66,6 +73,7 @@ public class MockSearchGateway : ISearchGateway }, new SearchResultItem { + Type = "doc", Url = "https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-document", Title = "Elasticsearch API documentation", Description = diff --git a/src/services/Elastic.Documentation.Assembler/AssembleContext.cs b/src/services/Elastic.Documentation.Assembler/AssembleContext.cs index 2a97e79a2..82044b41d 100644 --- a/src/services/Elastic.Documentation.Assembler/AssembleContext.cs +++ b/src/services/Elastic.Documentation.Assembler/AssembleContext.cs @@ -34,6 +34,10 @@ public class AssembleContext : IDocumentationConfigurationContext public LegacyUrlMappingConfiguration LegacyUrlMappings { get; } public SynonymsConfiguration SynonymsConfiguration { get; } + // Always use the production URL. In case a page is leaked to a search engine, it should point to the production site. + /// + public Uri? 
CanonicalBaseUrl { get; } = new("https://www.elastic.co"); + public IDirectoryInfo CheckoutDirectory { get; } public IDirectoryInfo OutputDirectory { get; } diff --git a/tests/Directory.Build.props b/tests/Directory.Build.props index 48c968500..c9a34eb7a 100644 --- a/tests/Directory.Build.props +++ b/tests/Directory.Build.props @@ -6,7 +6,7 @@ false false true - CA1822 + CA1822;xUnit1004 false true diff --git a/tests/Elastic.ApiExplorer.Tests/OpenApiDocumentExporterTests.cs b/tests/Elastic.ApiExplorer.Tests/OpenApiDocumentExporterTests.cs new file mode 100644 index 000000000..f2d21da1c --- /dev/null +++ b/tests/Elastic.ApiExplorer.Tests/OpenApiDocumentExporterTests.cs @@ -0,0 +1,172 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + +using System.Collections.Concurrent; +using Elastic.ApiExplorer.Elasticsearch; +using Elastic.Documentation; +using Elastic.Documentation.Configuration.Versions; +using Elastic.Documentation.Search; +using FluentAssertions; +using static System.StringComparison; + +namespace Elastic.ApiExplorer.Tests; + +public class OpenApiDocumentExporterTests +{ + private static readonly HttpClient HttpClient = new(); + private const string BaseUrl = "https://www.elastic.co"; + + [Fact(Skip = "This spams elastic.co, run this manually")] + public async Task ExportedDocumentUrlsShouldReturnSuccessStatusCode() + { + // Arrange + var versionsConfiguration = new VersionsConfiguration + { + VersioningSystems = new Dictionary + { + { + VersioningSystemId.Stack, + new VersioningSystem + { + Id = VersioningSystemId.Stack, + Base = new SemVersion(8, 0, 0), + Current = new SemVersion(9, 2, 0) + } + } + } + }; + + var exporter = new OpenApiDocumentExporter(versionsConfiguration); + const int limitPerSource = 300; // Cap at 300 documents from each source (Elasticsearch and Kibana) + + // Act - Collect all documents, 
tracking source + var documents = new List<(string Url, string Source)>(); + await foreach (var doc in exporter.ExportDocuments(limitPerSource, TestContext.Current.CancellationToken)) + { + if (!string.IsNullOrEmpty(doc.Url)) + { + // Determine source from URL + var source = doc.Url.Contains("/elasticsearch/") ? "elasticsearch" : "kibana"; + documents.Add((doc.Url, source)); + } + } + + // Assert we have documents from both sources + documents.Should().NotBeEmpty("the exporter should return at least some documents"); + var elasticsearchDocs = documents.Where(d => d.Source == "elasticsearch").ToList(); + var kibanaDocs = documents.Where(d => d.Source == "kibana").ToList(); + + elasticsearchDocs.Should().NotBeEmpty("should have Elasticsearch documents"); + kibanaDocs.Should().NotBeEmpty("should have Kibana documents"); + + // Take all documents as sample (already limited) + var sample = documents.Select(d => d.Url).ToList(); + + // Test each URL in parallel + var failures = new ConcurrentBag<(string Url, int StatusCode)>(); + + await Parallel.ForEachAsync(sample, + new ParallelOptions { MaxDegreeOfParallelism = 10, CancellationToken = TestContext.Current.CancellationToken }, + async (url, ct) => + { + var fullUrl = $"{BaseUrl}{url}"; + + try + { + using var request = new HttpRequestMessage(HttpMethod.Head, fullUrl); + + // Mimic browser headers + request.Headers.Add("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"); + request.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"); + request.Headers.Add("Accept-Language", "en-US,en;q=0.9"); + request.Headers.Add("Accept-Encoding", "gzip, deflate, br"); + request.Headers.Add("DNT", "1"); + request.Headers.Add("Connection", "keep-alive"); + request.Headers.Add("Upgrade-Insecure-Requests", "1"); + request.Headers.Add("Sec-Fetch-Dest", "document"); + 
request.Headers.Add("Sec-Fetch-Mode", "navigate"); + request.Headers.Add("Sec-Fetch-Site", "none"); + request.Headers.Add("Sec-Fetch-User", "?1"); + request.Headers.Add("Cache-Control", "max-age=0"); + + var response = await HttpClient.SendAsync( + request, + HttpCompletionOption.ResponseHeadersRead, + ct + ); + + if (!response.IsSuccessStatusCode) + { + failures.Add((url, (int)response.StatusCode)); + } + } + catch + { + failures.Add((url, -1)); // Use -1 to indicate exception + } + }); + + // Assert all URLs returned 200 + failures.Should().BeEmpty( + $"all sampled URLs should return 200 OK, but the following failed: {string.Join(", ", failures.Select(f => $"{f.Url} ({f.StatusCode})"))}" + ); + } + + [Fact] + public async Task DescriptionWithHtmlOperationsListShouldTransformToMarkdownAtEnd() + { + // Arrange + var versionsConfiguration = new VersionsConfiguration + { + VersioningSystems = new Dictionary + { + { + VersioningSystemId.Stack, + new VersioningSystem + { + Id = VersioningSystemId.Stack, + Base = new SemVersion(8, 0, 0), + Current = new SemVersion(9, 2, 0) + } + } + } + }; + + var exporter = new OpenApiDocumentExporter(versionsConfiguration); + + // Act - Get some Elasticsearch documents + var documents = new List(); + await foreach (var doc in exporter.ExportDocuments(limitPerSource: 100, TestContext.Current.CancellationToken)) + { + if (doc.Description != null && doc.Description.Contains("**All methods and paths for this operation:**")) + { + documents.Add(doc); + } + } + + // Assert we found at least one document with the pattern + documents.Should().NotBeEmpty("there should be at least one document with operation list"); + + foreach (var doc in documents) + { + // Should not contain HTML + doc.Description.Should().NotContain("
", "HTML should be converted to markdown"); + doc.Description.Should().NotContain(" !string.IsNullOrWhiteSpace(l)).TakeLast(5).ToList(); + + // At least one of the last few lines should be a Markdown list item + var hasMarkdownListAtEnd = lastNonEmptyLines.Any(l => l.StartsWith("- **", InvariantCulture)); + hasMarkdownListAtEnd.Should().BeTrue( + $"markdown list should be at the end of the description. Last lines:\n{string.Join("\n", lastNonEmptyLines)}\n\nFull description:\n{doc.Description}" + ); + } + } +}