diff --git a/src/Elastic.Markdown/Myst/Linters/WhiteSpaceNormalizer.cs b/src/Elastic.Markdown/Myst/Linters/WhiteSpaceNormalizer.cs
new file mode 100644
index 000000000..25af91bef
--- /dev/null
+++ b/src/Elastic.Markdown/Myst/Linters/WhiteSpaceNormalizer.cs
@@ -0,0 +1,161 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+using System.Buffers;
+using Elastic.Markdown.Diagnostics;
+using Markdig;
+using Markdig.Helpers;
+using Markdig.Parsers;
+using Markdig.Parsers.Inlines;
+using Markdig.Renderers;
+using Markdig.Renderers.Html;
+using Markdig.Renderers.Html.Inlines;
+using Markdig.Syntax.Inlines;
+
+namespace Elastic.Markdown.Myst.Linters;
+
+/// <summary>Pipeline-builder helpers for the irregular-whitespace linter.</summary>
+public static class WhiteSpaceNormalizerBuilderExtensions
+{
+    /// <summary>Adds <see cref="WhiteSpaceNormalizerBuilderExtension"/> to the pipeline unless it is already registered.</summary>
+    /// <param name="pipeline">The pipeline builder to extend.</param>
+    /// <returns>The same <paramref name="pipeline"/>, so calls can be chained.</returns>
+    public static MarkdownPipelineBuilder UseWhiteSpaceNormalizer(this MarkdownPipelineBuilder pipeline)
+    {
+        pipeline.Extensions.AddIfNotAlready<WhiteSpaceNormalizerBuilderExtension>();
+        return pipeline;
+    }
+}
+
+/// <summary>Registers the irregular-whitespace inline parser and its HTML renderer with Markdig.</summary>
+public class WhiteSpaceNormalizerBuilderExtension : IMarkdownExtension
+{
+    // NOTE(review): InsertBefore/InsertAfter take their anchor as a generic type argument, and none
+    // is visible in this view of the change - confirm the intended anchors against the real file.
+
+    /// <inheritdoc />
+    public void Setup(MarkdownPipelineBuilder pipeline) =>
+        pipeline.InlineParsers.InsertBefore(new WhiteSpaceNormalizerParser());
+
+    /// <inheritdoc />
+    public void Setup(MarkdownPipeline pipeline, IMarkdownRenderer renderer) =>
+        renderer.ObjectRenderers.InsertAfter(new WhiteSpaceNormalizerRenderer());
+}
+
+/// <summary>
+/// Inline parser that consumes one irregular whitespace character, reports it as a hint and puts an
+/// <see cref="IrregularWhiteSpace"/> node in its place.
+/// </summary>
+public class WhiteSpaceNormalizerParser : InlineParser
+{
+    // Irregular whitespace characters that may impair Markdown rendering.
+    // Every entry needs a matching arm in GetCharacterName.
+    private static readonly char[] IrregularWhitespaceChars =
+    [
+        '\u000B', // Line Tabulation (\v) - VT
+        '\u000C', // Form Feed (\f) - FF
+        '\u00A0', // No-Break Space - NBSP
+        '\u0085', // Next Line
+        '\u1680', // Ogham Space Mark
+        '\u180E', // Mongolian Vowel Separator - MVS
+        '\ufeff', // Zero Width No-Break Space - BOM
+        '\u2000', // En Quad
+        '\u2001', // Em Quad
+        '\u2002', // En Space - ENSP
+        '\u2003', // Em Space - EMSP
+        '\u2004', // Three-Per-Em
+        '\u2005', // Four-Per-Em
+        '\u2006', // Six-Per-Em
+        '\u2007', // Figure Space
+        '\u2008', // Punctuation Space - PUNCSP
+        '\u2009', // Thin Space
+        '\u200A', // Hair Space
+        '\u200B', // Zero Width Space - ZWSP
+        '\u2028', // Line Separator
+        '\u2029', // Paragraph Separator
+        '\u202F', // Narrow No-Break Space
+        '\u205F', // Medium Mathematical Space
+        '\u3000' // Ideographic Space
+    ];
+
+    private static readonly SearchValues<char> WhiteSpaceSearchValues = SearchValues.Create(IrregularWhitespaceChars);
+
+    /// <summary>Creates the parser with every irregular whitespace character as an opening character.</summary>
+    public WhiteSpaceNormalizerParser() => OpeningCharacters = IrregularWhitespaceChars;
+
+    /// <summary>Consumes the irregular whitespace character at the start of <paramref name="slice"/>, if any.</summary>
+    /// <param name="processor">Inline processor that receives the new inline and the hint.</param>
+    /// <param name="slice">Remaining text; advanced by one character on a match.</param>
+    /// <returns><c>true</c> when a character was consumed; otherwise <c>false</c>.</returns>
+    public override bool Match(InlineProcessor processor, ref StringSlice slice)
+    {
+        var c = slice.CurrentChar;
+        if (!WhiteSpaceSearchValues.Contains(c))
+            return false;
+
+        // An inline belongs to a single parent, so each match gets its own node; one shared
+        // instance cannot stand at several positions of a tree, let alone in several documents.
+        var position = processor.GetSourcePosition(slice.Start, out var line, out var column);
+        processor.Inline = new IrregularWhiteSpace
+        {
+            Span = new(position, position),
+            Line = line,
+            Column = column
+        };
+
+        var charName = GetCharacterName(c);
+        processor.EmitHint(processor.Inline, 1, $"Irregular whitespace character detected: U+{(int)c:X4} ({charName}). This may impair Markdown rendering.");
+
+        slice.SkipChar();
+        return true;
+    }
+
+    // Friendly name of an irregular whitespace character, used in the hint message.
+    private static string GetCharacterName(char c) => c switch
+    {
+        '\u000B' => "Line Tabulation (VT)",
+        '\u000C' => "Form Feed (FF)",
+        '\u00A0' => "No-Break Space (NBSP)",
+        '\u0085' => "Next Line",
+        '\u1680' => "Ogham Space Mark",
+        '\u180E' => "Mongolian Vowel Separator (MVS)",
+        '\ufeff' => "Zero Width No-Break Space (BOM)",
+        '\u2000' => "En Quad",
+        '\u2001' => "Em Quad",
+        '\u2002' => "En Space (ENSP)",
+        '\u2003' => "Em Space (EMSP)",
+        '\u2004' => "Three-Per-Em",
+        '\u2005' => "Four-Per-Em",
+        '\u2006' => "Six-Per-Em",
+        '\u2007' => "Figure Space",
+        '\u2008' => "Punctuation Space (PUNCSP)",
+        '\u2009' => "Thin Space",
+        '\u200A' => "Hair Space",
+        '\u200B' => "Zero Width Space (ZWSP)",
+        '\u2028' => "Line Separator",
+        '\u2029' => "Paragraph Separator",
+        '\u202F' => "Narrow No-Break Space",
+        '\u205F' => "Medium Mathematical Space",
+        '\u3000' => "Ideographic Space",
+        _ => "Unknown"
+    };
+}
+
+/// <summary>Syntax-tree node that stands in for one irregular whitespace character.</summary>
+public class IrregularWhiteSpace : LeafInline
+{
+    /// <summary>
+    /// Shared instance kept for source compatibility. The parser no longer attaches it to documents
+    /// and creates one node per match instead.
+    /// </summary>
+    public static readonly IrregularWhiteSpace Instance = new();
+}
+
+/// <summary>Writes an <see cref="IrregularWhiteSpace"/> node as a regular space.</summary>
+public class WhiteSpaceNormalizerRenderer : HtmlObjectRenderer<IrregularWhiteSpace>
+{
+    /// <inheritdoc />
+    protected override void Write(HtmlRenderer renderer, IrregularWhiteSpace obj) =>
+        renderer.Write(' ');
+}
diff --git a/src/Elastic.Markdown/Myst/MarkdownParser.cs b/src/Elastic.Markdown/Myst/MarkdownParser.cs index 24dfd6010..1feafcbf8 100644 --- a/src/Elastic.Markdown/Myst/MarkdownParser.cs +++ b/src/Elastic.Markdown/Myst/MarkdownParser.cs @@ -3,16 +3,22 @@ // See the LICENSE file in the project root for more information using System.IO.Abstractions; +using System.Text.RegularExpressions; + using Cysharp.IO; + +using Elastic.Documentation.Diagnostics; using Elastic.Markdown.Myst.CodeBlocks; using Elastic.Markdown.Myst.Comments; using Elastic.Markdown.Myst.Directives; using Elastic.Markdown.Myst.FrontMatter; using
Elastic.Markdown.Myst.InlineParsers; using Elastic.Markdown.Myst.InlineParsers.Substitution; +using Elastic.Markdown.Myst.Linters; using Elastic.Markdown.Myst.Renderers; using Elastic.Markdown.Myst.Roles; using Elastic.Markdown.Myst.Roles.AppliesTo; + using Markdig; using Markdig.Extensions.EmphasisExtras; using Markdig.Parsers; @@ -92,20 +98,18 @@ private static async Task ParseAsync( MarkdownPipeline pipeline, Cancel ctx) { + string inputMarkdown; if (path.FileSystem is FileSystem) { //real IO optimize through UTF8 stream reader. await using var streamReader = new Utf8StreamReader(path.FullName, fileOpenMode: FileOpenMode.Throughput); - var inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx); - var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context); - return markdownDocument; + inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx); } else - { - var inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx); - var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context); - return markdownDocument; - } + inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx); + + var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context); + return markdownDocument; } // ReSharper disable once InconsistentNaming @@ -156,6 +160,7 @@ public MarkdownPipeline Pipeline .UseEnhancedCodeBlocks() .UseHtmxLinkInlineRenderer() .DisableHtml() + .UseWhiteSpaceNormalizer() .UseHardBreaks(); _ = builder.BlockParsers.TryRemove(); _pipelineCached = builder.Build(); diff --git a/tests/authoring/Framework/ErrorCollectorAssertions.fs b/tests/authoring/Framework/ErrorCollectorAssertions.fs index 9e844f67a..2557f1b33 100644 --- a/tests/authoring/Framework/ErrorCollectorAssertions.fs +++ b/tests/authoring/Framework/ErrorCollectorAssertions.fs @@ -54,4 +54,19 @@ module DiagnosticsCollectorAssertions = | Some e -> let message = e.Message test <@ 
message.Contains(expected) @>
-        | None -> failwithf "Expected errors but no errors were logged"
+        | None -> failwithf "Expected warnings but no warnings were logged"
+
+    /// Asserts that hints were collected and that the first hint's message contains `expected`.
+    []
+    let hasHint (expected: string) (actual: Lazy) =
+        let collector = actual.Value.Context.Collector
+        // The collector's hint counter must be above zero.
+        collector.Hints |> shouldBeGreaterThan 0
+        // Only the first diagnostic with hint severity is inspected.
+        let firstHint =
+            collector.Diagnostics |> Seq.filter (fun d -> d.Severity = Severity.Hint) |> Seq.tryHead
+        match firstHint with
+        | None -> failwithf "Expected hints but no hints were logged"
+        | Some hint ->
+            let message = hint.Message
+            test <@ message.Contains(expected) @>
diff --git a/tests/authoring/Framework/TestValues.fs b/tests/authoring/Framework/TestValues.fs index 549083f63..8ec8e319c 100644 --- a/tests/authoring/Framework/TestValues.fs +++ b/tests/authoring/Framework/TestValues.fs @@ -26,8 +26,10 @@ type TestDiagnosticsOutput() = match diagnostic.Severity with | Severity.Error -> output.WriteLine($"Error: {diagnostic.Message} ({diagnostic.File}:{line})") - | _ -> + | Severity.Warning -> output.WriteLine($"Warn : {diagnostic.Message} ({diagnostic.File}:{line})") + | _ -> + output.WriteLine($"Hint : {diagnostic.Message} ({diagnostic.File}:{line})") | _ -> () diff --git a/tests/authoring/Inline/Comments.fs b/tests/authoring/Inline/Comments.fs index a05bb87a6..c325beb90 100644 --- a/tests/authoring/Inline/Comments.fs +++ b/tests/authoring/Inline/Comments.fs @@ -17,3 +17,4 @@ not a comment [] let ``validate HTML: commented line should not be emitted`` () = markdown |> convertsToHtml """

not a comment

""" + diff --git a/tests/authoring/Linters/WhiteSpaceNormalizers.fs b/tests/authoring/Linters/WhiteSpaceNormalizers.fs new file mode 100644 index 000000000..3f8e44ddd --- /dev/null +++ b/tests/authoring/Linters/WhiteSpaceNormalizers.fs @@ -0,0 +1,23 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + +module ``linters``.``white space normalizers`` + +open Xunit +open authoring + + +type ``white space detection`` () = + + static let markdown = Setup.Markdown $""" +not a{'\u000B'}space +""" + + [] + let ``validate HTML: should not contain bad space character`` () = + markdown |> convertsToHtml """

not a space

""" + + [] + let ``emits a hint when a bad space is used`` () = + markdown |> hasHint "Irregular whitespace character detected: U+000B (Line Tabulation (VT)). This may impair Markdown rendering." diff --git a/tests/authoring/authoring.fsproj b/tests/authoring/authoring.fsproj index aa289193f..2520c7227 100644 --- a/tests/authoring/authoring.fsproj +++ b/tests/authoring/authoring.fsproj @@ -58,4 +58,8 @@ + + + +