From 3050c5cd8456def7f32e603b5d0fc0ee7356d348 Mon Sep 17 00:00:00 2001 From: Fabrizio Ferri Benedetti Date: Fri, 16 May 2025 10:25:39 +0200 Subject: [PATCH 1/8] Add Irregular Whitespace detection --- src/Elastic.Markdown/Myst/MarkdownParser.cs | 119 ++++++++++++++++++-- 1 file changed, 108 insertions(+), 11 deletions(-) diff --git a/src/Elastic.Markdown/Myst/MarkdownParser.cs b/src/Elastic.Markdown/Myst/MarkdownParser.cs index 24dfd6010..c981f3614 100644 --- a/src/Elastic.Markdown/Myst/MarkdownParser.cs +++ b/src/Elastic.Markdown/Myst/MarkdownParser.cs @@ -4,6 +4,8 @@ using System.IO.Abstractions; using Cysharp.IO; +using Elastic.Documentation.Diagnostics; +using System.Text.RegularExpressions; using Elastic.Markdown.Myst.CodeBlocks; using Elastic.Markdown.Myst.Comments; using Elastic.Markdown.Myst.Directives; @@ -25,6 +27,92 @@ public class MarkdownParser(BuildContext build, IParserResolvers resolvers) private BuildContext Build { get; } = build; private IParserResolvers Resolvers { get; } = resolvers; + // Collection of irregular whitespace characters that may impair Markdown rendering + private static readonly char[] IrregularWhitespaceChars = { + '\u000B', // Line Tabulation (\v) - + '\u000C', // Form Feed (\f) - + '\u00A0', // No-Break Space - + '\u0085', // Next Line + '\u1680', // Ogham Space Mark + '\u180E', // Mongolian Vowel Separator - + '\ufeff', // Zero Width No-Break Space - + '\u2000', // En Quad + '\u2001', // Em Quad + '\u2002', // En Space - + '\u2003', // Em Space - + '\u2004', // Tree-Per-Em + '\u2005', // Four-Per-Em + '\u2006', // Six-Per-Em + '\u2007', // Figure Space + '\u2008', // Punctuation Space - + '\u2009', // Thin Space + '\u200A', // Hair Space + '\u200B', // Zero Width Space - + '\u2028', // Line Separator + '\u2029', // Paragraph Separator + '\u202F', // Narrow No-Break Space + '\u205F', // Medium Mathematical Space + '\u3000' // Ideographic Space + }; + + // Detects irregular whitespace in the markdown content and reports diagnostics + private void DetectIrregularWhitespace(string content, string filePath) + { + var lines = content.Split(new[] { "\r\n", "\n", "\r" }, StringSplitOptions.None); + + for (var lineIndex = 0; lineIndex < lines.Length; lineIndex++) + { + var line = lines[lineIndex]; + for (var columnIndex = 0; columnIndex < line.Length; columnIndex++) + { + var c = line[columnIndex]; + if (Array.IndexOf(IrregularWhitespaceChars, c) >= 0) + { + var charName = GetCharacterName(c); + Build.Collector.Write(new Diagnostic + { + Severity = Severity.Warning, + File = filePath, + Line = lineIndex + 1, // 1-based line number + Column = columnIndex + 1, // 1-based column number + Length = 1, + Message = $"Irregular whitespace character detected: U+{(int)c:X4} ({charName}). This may impair Markdown rendering." + }); + } + } + } + } + + // Helper to get a friendly name for the whitespace character + private static string GetCharacterName(char c) => c switch + { + '\u000B' => "Line Tabulation (VT)", + '\u000C' => "Form Feed (FF)", + '\u00A0' => "No-Break Space (NBSP)", + '\u0085' => "Next Line", + '\u1680' => "Ogham Space Mark", + '\u180E' => "Mongolian Vowel Separator (MVS)", + '\ufeff' => "Zero Width No-Break Space (BOM)", + '\u2000' => "En Quad", + '\u2001' => "Em Quad", + '\u2002' => "En Space (ENSP)", + '\u2003' => "Em Space (EMSP)", + '\u2004' => "Tree-Per-Em", + '\u2005' => "Four-Per-Em", + '\u2006' => "Six-Per-Em", + '\u2007' => "Figure Space", + '\u2008' => "Punctuation Space (PUNCSP)", + '\u2009' => "Thin Space", + '\u200A' => "Hair Space", + '\u200B' => "Zero Width Space (ZWSP)", + '\u2028' => "Line Separator", + '\u2029' => "Paragraph Separator", + '\u202F' => "Narrow No-Break Space", + '\u205F' => "Medium Mathematical Space", + '\u3000' => "Ideographic Space", + _ => "Unknown" + }; + public Task MinimalParseAsync(IFileInfo path, Cancel ctx) { var state = new ParserState(Build) @@ -66,11 +154,17 @@ public Task ParseSnippetAsync(IFileInfo path, IFileInfo parent return ParseAsync(path, context, Pipeline, ctx); } - public MarkdownDocument ParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter) => - ParseMarkdownStringAsync(markdown, path, matter, Pipeline); + public MarkdownDocument ParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter) + { + DetectIrregularWhitespace(markdown, path.FullName); + return ParseMarkdownStringAsync(markdown, path, matter, Pipeline); + } - public MarkdownDocument MinimalParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter) => - ParseMarkdownStringAsync(markdown, path, matter, MinimalPipeline); + public MarkdownDocument MinimalParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter) + { + DetectIrregularWhitespace(markdown, path.FullName); + return ParseMarkdownStringAsync(markdown, path, matter, MinimalPipeline); + } private MarkdownDocument ParseMarkdownStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter, MarkdownPipeline pipeline) { @@ -86,26 +180,29 @@ private MarkdownDocument ParseMarkdownStringAsync(string markdown, IFileInfo pat return markdownDocument; } - private static async Task ParseAsync( + private async Task ParseAsync( IFileInfo path, MarkdownParserContext context, MarkdownPipeline pipeline, Cancel ctx) { + string inputMarkdown; if (path.FileSystem is FileSystem) { //real IO optimize through UTF8 stream reader. await using var streamReader = new Utf8StreamReader(path.FullName, fileOpenMode: FileOpenMode.Throughput); - var inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx); - var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context); - return markdownDocument; + inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx); } else { - var inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx); - var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context); - return markdownDocument; + inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx); } + + // Check for irregular whitespace characters + DetectIrregularWhitespace(inputMarkdown, path.FullName); + + var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context); + return markdownDocument; } // ReSharper disable once InconsistentNaming From aa53b9a1df6e4aa8f4a37848492b16f3f25ad75d Mon Sep 17 00:00:00 2001 From: Fabrizio Ferri Benedetti Date: Fri, 16 May 2025 10:38:12 +0200 Subject: [PATCH 2/8] Format --- src/Elastic.Markdown/Myst/MarkdownParser.cs | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/Elastic.Markdown/Myst/MarkdownParser.cs b/src/Elastic.Markdown/Myst/MarkdownParser.cs index c981f3614..bbe747d2f 100644 --- a/src/Elastic.Markdown/Myst/MarkdownParser.cs +++ b/src/Elastic.Markdown/Myst/MarkdownParser.cs @@ -3,9 +3,11 @@ // See the LICENSE file in the project root for more information using System.IO.Abstractions; +using System.Text.RegularExpressions; + using Cysharp.IO; + using Elastic.Documentation.Diagnostics; -using System.Text.RegularExpressions; using Elastic.Markdown.Myst.CodeBlocks; using Elastic.Markdown.Myst.Comments; using Elastic.Markdown.Myst.Directives; @@ -15,6 +17,7 @@ using Elastic.Markdown.Myst.Renderers; using Elastic.Markdown.Myst.Roles; using Elastic.Markdown.Myst.Roles.AppliesTo; + using Markdig; using Markdig.Extensions.EmphasisExtras; using Markdig.Parsers; @@ -28,7 +31,8 @@ public class MarkdownParser(BuildContext build, IParserResolvers resolvers) private IParserResolvers Resolvers { get; } = resolvers; // Collection of irregular whitespace characters that may impair Markdown rendering - private static readonly char[] IrregularWhitespaceChars = { + private static readonly char[] IrregularWhitespaceChars = + [ '\u000B', // Line Tabulation (\v) - '\u000C', // Form Feed (\f) - '\u00A0', // No-Break Space - @@ -53,13 +57,13 @@ public class MarkdownParser(BuildContext build, IParserResolvers resolvers) '\u202F', // Narrow No-Break Space '\u205F', // Medium Mathematical Space '\u3000' // Ideographic Space - }; + ]; // Detects irregular whitespace in the markdown content and reports diagnostics private void DetectIrregularWhitespace(string content, string filePath) { - var lines = content.Split(new[] { "\r\n", "\n", "\r" }, StringSplitOptions.None); - + var lines = content.Split(["\r\n", "\n", "\r"], StringSplitOptions.None); + for (var lineIndex = 0; lineIndex < lines.Length; lineIndex++) { var line = lines[lineIndex]; @@ -82,7 +86,7 @@ private void DetectIrregularWhitespace(string content, string filePath) } } } - + // Helper to get a friendly name for the whitespace character private static string GetCharacterName(char c) => c switch { @@ -113,6 +117,7 @@ private void DetectIrregularWhitespace(string content, string filePath) _ => "Unknown" }; + public Task MinimalParseAsync(IFileInfo path, Cancel ctx) { var state = new ParserState(Build) @@ -197,10 +202,10 @@ private async Task ParseAsync( { inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx); } - + // Check for irregular whitespace characters DetectIrregularWhitespace(inputMarkdown, path.FullName); - + var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context); return markdownDocument; } From 0d031be54706b7a1ae72d7680a8125cb6f24d01e Mon Sep 17 00:00:00 2001 From: Fabrizio Ferri Benedetti Date: Fri, 16 May 2025 10:42:17 +0200 Subject: [PATCH 3/8] Add test --- .../TextFormat/IrregularWhitespaceTest.cs | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs diff --git a/tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs b/tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs new file mode 100644 index 000000000..4b390c045 --- /dev/null +++ b/tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs @@ -0,0 +1,119 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + +using Elastic.Documentation.Diagnostics; +using Elastic.Markdown.Tests.Inline; +using FluentAssertions; + +namespace Elastic.Markdown.Tests.TextFormat; + +public class IrregularWhitespaceTest(ITestOutputHelper output) : InlineTest(output, +""" +# Heading with no-break space\u00A0character + +This is a paragraph with some\u2002en space and\u200Bzero width space. + +## Subheading with\u3000ideographic space + +* List item with\u00A0no-break space +* Another item + +```csharp +// Code with\u00A0no-break space +var x = 1; +``` + +> Blockquote with\u2003em space +""" +) +{ + [Fact] + public void DetectsIrregularWhitespaceInHeading() + { + var diagnostics = Collector.Diagnostics + .Where(d => d.Line == 1) + .Where(d => d.Message.Contains("U+00A0")) + .ToList(); + + diagnostics.Should().HaveCount(1); + diagnostics[0].Severity.Should().Be(Severity.Warning); + } + + [Fact] + public void DetectsIrregularWhitespaceInParagraph() + { + var diagnostics = Collector.Diagnostics + .Where(d => d.Line == 3) + .Where(d => d.Message.Contains("irregular whitespace")) + .ToList(); + + diagnostics.Should().HaveCountGreaterThanOrEqualTo(2); + + // Verify en space detection + diagnostics.Should().Contain(d => d.Message.Contains("U+2002")); + + // Verify zero width space detection + diagnostics.Should().Contain(d => d.Message.Contains("U+200B")); + } + + [Fact] + public void DetectsIrregularWhitespaceInSubheading() + { + var diagnostics = Collector.Diagnostics + .Where(d => d.Line == 5) + .Where(d => d.Message.Contains("U+3000")) + .ToList(); + + diagnostics.Should().HaveCount(1); + diagnostics[0].Severity.Should().Be(Severity.Warning); + } + + [Fact] + public void DetectsIrregularWhitespaceInListItem() + { + var diagnostics = Collector.Diagnostics + .Where(d => d.Line == 7) + .Where(d => d.Message.Contains("U+00A0")) + .ToList(); + + diagnostics.Should().HaveCount(1); + diagnostics[0].Severity.Should().Be(Severity.Warning); + } + + [Fact] + public void DetectsIrregularWhitespaceInCodeBlock() + { + var diagnostics = Collector.Diagnostics + .Where(d => d.Line == 11) + .Where(d => d.Message.Contains("U+00A0")) + .ToList(); + + diagnostics.Should().HaveCount(1); + diagnostics[0].Severity.Should().Be(Severity.Warning); + } + + [Fact] + public void DetectsIrregularWhitespaceInBlockquote() + { + var diagnostics = Collector.Diagnostics + .Where(d => d.Line == 15) + .Where(d => d.Message.Contains("U+2003")) + .ToList(); + + diagnostics.Should().HaveCount(1); + diagnostics[0].Severity.Should().Be(Severity.Warning); + } + + [Fact] + public void GeneratesProperWarningMessage() + { + var noBreakSpaceWarning = Collector.Diagnostics + .FirstOrDefault(d => d.Message.Contains("U+00A0")); + + noBreakSpaceWarning.Should().NotBeNull(); + noBreakSpaceWarning!.Message.Should() + .Contain("Irregular whitespace character detected: U+00A0 (No-Break Space (NBSP))") + .And.Contain("may impair Markdown rendering"); + } +} From 50180622755e65ae7c5cc42ba1273adbd8490bbf Mon Sep 17 00:00:00 2001 From: Fabrizio Ferri Benedetti Date: Fri, 16 May 2025 10:55:16 +0200 Subject: [PATCH 4/8] Formatting --- .../TextFormat/IrregularWhitespaceTest.cs | 181 +++++++++--------- 1 file changed, 93 insertions(+), 88 deletions(-) diff --git a/tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs b/tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs index 4b390c045..3ea54e9f7 100644 --- a/tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs +++ b/tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs @@ -2,10 +2,15 @@ // Elasticsearch B.V licenses this file to you under the Apache 2.0 License. // See the LICENSE file in the project root for more information +using System.Linq; + using Elastic.Documentation.Diagnostics; using Elastic.Markdown.Tests.Inline; using FluentAssertions; +using Xunit; +using Xunit.Abstractions; + namespace Elastic.Markdown.Tests.TextFormat; public class IrregularWhitespaceTest(ITestOutputHelper output) : InlineTest(output, @@ -28,92 +33,92 @@ This is a paragraph with some\u2002en space and\u200Bzero width space. """ ) { - [Fact] - public void DetectsIrregularWhitespaceInHeading() - { - var diagnostics = Collector.Diagnostics - .Where(d => d.Line == 1) - .Where(d => d.Message.Contains("U+00A0")) - .ToList(); - - diagnostics.Should().HaveCount(1); - diagnostics[0].Severity.Should().Be(Severity.Warning); - } - - [Fact] - public void DetectsIrregularWhitespaceInParagraph() - { - var diagnostics = Collector.Diagnostics - .Where(d => d.Line == 3) - .Where(d => d.Message.Contains("irregular whitespace")) - .ToList(); - - diagnostics.Should().HaveCountGreaterThanOrEqualTo(2); - - // Verify en space detection - diagnostics.Should().Contain(d => d.Message.Contains("U+2002")); - - // Verify zero width space detection - diagnostics.Should().Contain(d => d.Message.Contains("U+200B")); - } - - [Fact] - public void DetectsIrregularWhitespaceInSubheading() - { - var diagnostics = Collector.Diagnostics - .Where(d => d.Line == 5) - .Where(d => d.Message.Contains("U+3000")) - .ToList(); - - diagnostics.Should().HaveCount(1); - diagnostics[0].Severity.Should().Be(Severity.Warning); - } - - [Fact] - public void DetectsIrregularWhitespaceInListItem() - { - var diagnostics = Collector.Diagnostics - .Where(d => d.Line == 7) - .Where(d => d.Message.Contains("U+00A0")) - .ToList(); - - diagnostics.Should().HaveCount(1); - diagnostics[0].Severity.Should().Be(Severity.Warning); - } - - [Fact] - public void DetectsIrregularWhitespaceInCodeBlock() - { - var diagnostics = Collector.Diagnostics - .Where(d => d.Line == 11) - .Where(d => d.Message.Contains("U+00A0")) - .ToList(); - - diagnostics.Should().HaveCount(1); - diagnostics[0].Severity.Should().Be(Severity.Warning); - } - - [Fact] - public void DetectsIrregularWhitespaceInBlockquote() - { - var diagnostics = Collector.Diagnostics - .Where(d => d.Line == 15) - .Where(d => d.Message.Contains("U+2003")) - .ToList(); - - diagnostics.Should().HaveCount(1); - diagnostics[0].Severity.Should().Be(Severity.Warning); - } - - [Fact] - public void GeneratesProperWarningMessage() - { - var noBreakSpaceWarning = Collector.Diagnostics - .FirstOrDefault(d => d.Message.Contains("U+00A0")); - - noBreakSpaceWarning.Should().NotBeNull(); - noBreakSpaceWarning!.Message.Should() - .Contain("Irregular whitespace character detected: U+00A0 (No-Break Space (NBSP))") - .And.Contain("may impair Markdown rendering"); - } + [Fact] + public void DetectsIrregularWhitespaceInHeading() + { + var diagnostics = Collector.Diagnostics + .Where(d => d.Line == 1) + .Where(d => d.Message.Contains("U+00A0")) + .ToList(); + + diagnostics.Should().HaveCount(1); + diagnostics[0].Severity.Should().Be(Severity.Warning); + } + + [Fact] + public void DetectsIrregularWhitespaceInParagraph() + { + var diagnostics = Collector.Diagnostics + .Where(d => d.Line == 3) + .Where(d => d.Message.Contains("irregular whitespace")) + .ToList(); + + diagnostics.Should().HaveCountGreaterThanOrEqualTo(2); + + // Verify en space detection + diagnostics.Should().Contain(d => d.Message.Contains("U+2002")); + + // Verify zero width space detection + diagnostics.Should().Contain(d => d.Message.Contains("U+200B")); + } + + [Fact] + public void DetectsIrregularWhitespaceInSubheading() + { + var diagnostics = Collector.Diagnostics + .Where(d => d.Line == 5) + .Where(d => d.Message.Contains("U+3000")) + .ToList(); + + diagnostics.Should().HaveCount(1); + diagnostics[0].Severity.Should().Be(Severity.Warning); + } + + [Fact] + public void DetectsIrregularWhitespaceInListItem() + { + var diagnostics = Collector.Diagnostics + .Where(d => d.Line == 7) + .Where(d => d.Message.Contains("U+00A0")) + .ToList(); + + diagnostics.Should().HaveCount(1); + diagnostics[0].Severity.Should().Be(Severity.Warning); + } + + [Fact] + public void DetectsIrregularWhitespaceInCodeBlock() + { + var diagnostics = Collector.Diagnostics + .Where(d => d.Line == 11) + .Where(d => d.Message.Contains("U+00A0")) + .ToList(); + + diagnostics.Should().HaveCount(1); + diagnostics[0].Severity.Should().Be(Severity.Warning); + } + + [Fact] + public void DetectsIrregularWhitespaceInBlockquote() + { + var diagnostics = Collector.Diagnostics + .Where(d => d.Line == 15) + .Where(d => d.Message.Contains("U+2003")) + .ToList(); + + diagnostics.Should().HaveCount(1); + diagnostics[0].Severity.Should().Be(Severity.Warning); + } + + [Fact] + public void GeneratesProperWarningMessage() + { + var noBreakSpaceWarning = Collector.Diagnostics + .FirstOrDefault(d => d.Message.Contains("U+00A0")); + + noBreakSpaceWarning.Should().NotBeNull(); + noBreakSpaceWarning!.Message.Should() + .Contain("Irregular whitespace character detected: U+00A0 (No-Break Space (NBSP))") + .And.Contain("may impair Markdown rendering"); + } } From f4464dc3e32bb87a5a12929885ca52a9b6abcdf8 Mon Sep 17 00:00:00 2001 From: Fabrizio Ferri Benedetti Date: Fri, 16 May 2025 10:57:22 +0200 Subject: [PATCH 5/8] Remove unused imports --- .../TextFormat/IrregularWhitespaceTest.cs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs b/tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs index 3ea54e9f7..0dafef3c0 100644 --- a/tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs +++ b/tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs @@ -3,14 +3,10 @@ // See the LICENSE file in the project root for more information using System.Linq; - using Elastic.Documentation.Diagnostics; using Elastic.Markdown.Tests.Inline; using FluentAssertions; -using Xunit; -using Xunit.Abstractions; - namespace Elastic.Markdown.Tests.TextFormat; public class IrregularWhitespaceTest(ITestOutputHelper output) : InlineTest(output, From 9b5793d07f791f4a9307c8d09871eb9315946a96 Mon Sep 17 00:00:00 2001 From: Fabrizio Ferri Benedetti Date: Fri, 16 May 2025 11:23:37 +0200 Subject: [PATCH 6/8] Remove test --- .../TextFormat/IrregularWhitespaceTest.cs | 120 ------------------ 1 file changed, 120 deletions(-) delete mode 100644 tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs diff --git a/tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs b/tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs deleted file mode 100644 index 0dafef3c0..000000000 --- a/tests/Elastic.Markdown.Tests/TextFormat/IrregularWhitespaceTest.cs +++ /dev/null @@ -1,120 +0,0 @@ -// Licensed to Elasticsearch B.V under one or more agreements. -// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. -// See the LICENSE file in the project root for more information - -using System.Linq; -using Elastic.Documentation.Diagnostics; -using Elastic.Markdown.Tests.Inline; -using FluentAssertions; - -namespace Elastic.Markdown.Tests.TextFormat; - -public class IrregularWhitespaceTest(ITestOutputHelper output) : InlineTest(output, -""" -# Heading with no-break space\u00A0character - -This is a paragraph with some\u2002en space and\u200Bzero width space. - -## Subheading with\u3000ideographic space - -* List item with\u00A0no-break space -* Another item - -```csharp -// Code with\u00A0no-break space -var x = 1; -``` - -> Blockquote with\u2003em space -""" -) -{ - [Fact] - public void DetectsIrregularWhitespaceInHeading() - { - var diagnostics = Collector.Diagnostics - .Where(d => d.Line == 1) - .Where(d => d.Message.Contains("U+00A0")) - .ToList(); - - diagnostics.Should().HaveCount(1); - diagnostics[0].Severity.Should().Be(Severity.Warning); - } - - [Fact] - public void DetectsIrregularWhitespaceInParagraph() - { - var diagnostics = Collector.Diagnostics - .Where(d => d.Line == 3) - .Where(d => d.Message.Contains("irregular whitespace")) - .ToList(); - - diagnostics.Should().HaveCountGreaterThanOrEqualTo(2); - - // Verify en space detection - diagnostics.Should().Contain(d => d.Message.Contains("U+2002")); - - // Verify zero width space detection - diagnostics.Should().Contain(d => d.Message.Contains("U+200B")); - } - - [Fact] - public void DetectsIrregularWhitespaceInSubheading() - { - var diagnostics = Collector.Diagnostics - .Where(d => d.Line == 5) - .Where(d => d.Message.Contains("U+3000")) - .ToList(); - - diagnostics.Should().HaveCount(1); - diagnostics[0].Severity.Should().Be(Severity.Warning); - } - - [Fact] - public void DetectsIrregularWhitespaceInListItem() - { - var diagnostics = Collector.Diagnostics - .Where(d => d.Line == 7) - .Where(d => d.Message.Contains("U+00A0")) - .ToList(); - - diagnostics.Should().HaveCount(1); - diagnostics[0].Severity.Should().Be(Severity.Warning); - } - - [Fact] - public void DetectsIrregularWhitespaceInCodeBlock() - { - var diagnostics = Collector.Diagnostics - .Where(d => d.Line == 11) - .Where(d => d.Message.Contains("U+00A0")) - .ToList(); - - diagnostics.Should().HaveCount(1); - diagnostics[0].Severity.Should().Be(Severity.Warning); - } - - [Fact] - public void DetectsIrregularWhitespaceInBlockquote() - { - var diagnostics = Collector.Diagnostics - .Where(d => d.Line == 15) - .Where(d => d.Message.Contains("U+2003")) - .ToList(); - - diagnostics.Should().HaveCount(1); - diagnostics[0].Severity.Should().Be(Severity.Warning); - } - - [Fact] - public void GeneratesProperWarningMessage() - { - var noBreakSpaceWarning = Collector.Diagnostics - .FirstOrDefault(d => d.Message.Contains("U+00A0")); - - noBreakSpaceWarning.Should().NotBeNull(); - noBreakSpaceWarning!.Message.Should() - .Contain("Irregular whitespace character detected: U+00A0 (No-Break Space (NBSP))") - .And.Contain("may impair Markdown rendering"); - } -} From 806a147ed1b85fcfe14a733a8c49ebc06c542fa2 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Mon, 19 May 2025 14:10:34 +0200 Subject: [PATCH 7/8] Move detecting irregular whitespace to a markdown parser and add tests --- .../Myst/Linters/WhiteSpaceNormalizer.cs | 127 ++++++++++++++++++ src/Elastic.Markdown/Myst/MarkdownParser.cs | 111 +-------------- .../Framework/ErrorCollectorAssertions.fs | 17 ++- tests/authoring/Framework/TestValues.fs | 4 +- tests/authoring/Inline/Comments.fs | 1 + .../Linters/WhiteSpaceNormalizers.fs | 19 +++ tests/authoring/authoring.fsproj | 4 + 7 files changed, 177 insertions(+), 106 deletions(-) create mode 100644 src/Elastic.Markdown/Myst/Linters/WhiteSpaceNormalizer.cs create mode 100644 tests/authoring/Linters/WhiteSpaceNormalizers.fs diff --git a/src/Elastic.Markdown/Myst/Linters/WhiteSpaceNormalizer.cs b/src/Elastic.Markdown/Myst/Linters/WhiteSpaceNormalizer.cs new file mode 100644 index 000000000..25af91bef --- /dev/null +++ b/src/Elastic.Markdown/Myst/Linters/WhiteSpaceNormalizer.cs @@ -0,0 +1,127 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + +using System.Buffers; +using Elastic.Markdown.Diagnostics; +using Markdig; +using Markdig.Helpers; +using Markdig.Parsers; +using Markdig.Parsers.Inlines; +using Markdig.Renderers; +using Markdig.Renderers.Html; +using Markdig.Renderers.Html.Inlines; +using Markdig.Syntax.Inlines; + +namespace Elastic.Markdown.Myst.Linters; + +public static class WhiteSpaceNormalizerBuilderExtensions +{ + public static MarkdownPipelineBuilder UseWhiteSpaceNormalizer(this MarkdownPipelineBuilder pipeline) + { + pipeline.Extensions.AddIfNotAlready(); + return pipeline; + } +} + +public class WhiteSpaceNormalizerBuilderExtension : IMarkdownExtension +{ + public void Setup(MarkdownPipelineBuilder pipeline) => + pipeline.InlineParsers.InsertBefore(new WhiteSpaceNormalizerParser()); + + public void Setup(MarkdownPipeline pipeline, IMarkdownRenderer renderer) => + renderer.ObjectRenderers.InsertAfter(new WhiteSpaceNormalizerRenderer()); +} + +public class WhiteSpaceNormalizerParser : InlineParser +{ + // Collection of irregular whitespace characters that may impair Markdown rendering + private static readonly char[] IrregularWhitespaceChars = + [ + '\u000B', // Line Tabulation (\v) - + '\u000C', // Form Feed (\f) - + '\u00A0', // No-Break Space - + '\u0085', // Next Line + '\u1680', // Ogham Space Mark + '\u180E', // Mongolian Vowel Separator - + '\ufeff', // Zero Width No-Break Space - + '\u2000', // En Quad + '\u2001', // Em Quad + '\u2002', // En Space - + '\u2003', // Em Space - + '\u2004', // Tree-Per-Em + '\u2005', // Four-Per-Em + '\u2006', // Six-Per-Em + '\u2007', // Figure Space + '\u2008', // Punctuation Space - + '\u2009', // Thin Space + '\u200A', // Hair Space + '\u200B', // Zero Width Space - + '\u2028', // Line Separator + '\u2029', // Paragraph Separator + '\u202F', // Narrow No-Break Space + '\u205F', // Medium Mathematical Space + '\u3000' // Ideographic Space + ]; + private static readonly SearchValues WhiteSpaceSearchValues = SearchValues.Create(IrregularWhitespaceChars); + + public WhiteSpaceNormalizerParser() => OpeningCharacters = IrregularWhitespaceChars; + + public override bool Match(InlineProcessor processor, ref StringSlice slice) + { + var span = slice.AsSpan().Slice(0, 1); + if (span.IndexOfAny(WhiteSpaceSearchValues) == -1) + return false; + + processor.Inline = IrregularWhiteSpace.Instance; + + var c = span[0]; + var charName = GetCharacterName(c); + + processor.EmitHint(processor.Inline, 1, $"Irregular whitespace character detected: U+{(int)c:X4} ({charName}). This may impair Markdown rendering."); + + slice.SkipChar(); + return true; + } + + // Helper to get a friendly name for the whitespace character + private static string GetCharacterName(char c) => c switch + { + '\u000B' => "Line Tabulation (VT)", + '\u000C' => "Form Feed (FF)", + '\u00A0' => "No-Break Space (NBSP)", + '\u0085' => "Next Line", + '\u1680' => "Ogham Space Mark", + '\u180E' => "Mongolian Vowel Separator (MVS)", + '\ufeff' => "Zero Width No-Break Space (BOM)", + '\u2000' => "En Quad", + '\u2001' => "Em Quad", + '\u2002' => "En Space (ENSP)", + '\u2003' => "Em Space (EMSP)", + '\u2004' => "Tree-Per-Em", + '\u2005' => "Four-Per-Em", + '\u2006' => "Six-Per-Em", + '\u2007' => "Figure Space", + '\u2008' => "Punctuation Space (PUNCSP)", + '\u2009' => "Thin Space", + '\u200A' => "Hair Space", + '\u200B' => "Zero Width Space (ZWSP)", + '\u2028' => "Line Separator", + '\u2029' => "Paragraph Separator", + '\u202F' => "Narrow No-Break Space", + '\u205F' => "Medium Mathematical Space", + '\u3000' => "Ideographic Space", + _ => "Unknown" + }; +} + +public class IrregularWhiteSpace : LeafInline +{ + public static readonly IrregularWhiteSpace Instance = new(); +}; + +public class WhiteSpaceNormalizerRenderer : HtmlObjectRenderer +{ + protected override void Write(HtmlRenderer renderer, IrregularWhiteSpace obj) => + renderer.Write(' '); +} diff --git a/src/Elastic.Markdown/Myst/MarkdownParser.cs b/src/Elastic.Markdown/Myst/MarkdownParser.cs index bbe747d2f..1feafcbf8 100644 --- a/src/Elastic.Markdown/Myst/MarkdownParser.cs +++ b/src/Elastic.Markdown/Myst/MarkdownParser.cs @@ -14,6 +14,7 @@ using Elastic.Markdown.Myst.FrontMatter; using Elastic.Markdown.Myst.InlineParsers; using Elastic.Markdown.Myst.InlineParsers.Substitution; +using Elastic.Markdown.Myst.Linters; using Elastic.Markdown.Myst.Renderers; using Elastic.Markdown.Myst.Roles; using Elastic.Markdown.Myst.Roles.AppliesTo; @@ -30,94 +31,6 @@ public class MarkdownParser(BuildContext build, IParserResolvers resolvers) private BuildContext Build { get; } = build; private IParserResolvers Resolvers { get; } = resolvers; - // Collection of irregular whitespace characters that may impair Markdown rendering - private static readonly char[] IrregularWhitespaceChars = - [ - '\u000B', // Line Tabulation (\v) - - '\u000C', // Form Feed (\f) - - '\u00A0', // No-Break Space - - '\u0085', // Next Line - '\u1680', // Ogham Space Mark - '\u180E', // Mongolian Vowel Separator - - '\ufeff', // Zero Width No-Break Space - - '\u2000', // En Quad - '\u2001', // Em Quad - '\u2002', // En Space - - '\u2003', // Em Space - - '\u2004', // Tree-Per-Em - '\u2005', // Four-Per-Em - '\u2006', // Six-Per-Em - '\u2007', // Figure Space - '\u2008', // Punctuation Space - - '\u2009', // Thin Space - '\u200A', // Hair Space - '\u200B', // Zero Width Space - - '\u2028', // Line Separator - '\u2029', // Paragraph Separator - '\u202F', // Narrow No-Break Space - '\u205F', // Medium Mathematical Space - '\u3000' // Ideographic Space - ]; - - // Detects irregular whitespace in the markdown content and reports diagnostics - private void DetectIrregularWhitespace(string content, string filePath) - { - var lines = content.Split(["\r\n", "\n", "\r"], StringSplitOptions.None); - - for (var lineIndex = 0; lineIndex < lines.Length; lineIndex++) - { - var line = lines[lineIndex]; - for (var columnIndex = 0; columnIndex < line.Length; columnIndex++) - { - var c = line[columnIndex]; - if (Array.IndexOf(IrregularWhitespaceChars, c) >= 0) - { - var charName = GetCharacterName(c); - Build.Collector.Write(new Diagnostic - { - Severity = Severity.Warning, - File = filePath, - Line = lineIndex + 1, // 1-based line number - Column = columnIndex + 1, // 1-based column number - Length = 1, - Message = $"Irregular whitespace character detected: U+{(int)c:X4} ({charName}). This may impair Markdown rendering." - }); - } - } - } - } - - // Helper to get a friendly name for the whitespace character - private static string GetCharacterName(char c) => c switch - { - '\u000B' => "Line Tabulation (VT)", - '\u000C' => "Form Feed (FF)", - '\u00A0' => "No-Break Space (NBSP)", - '\u0085' => "Next Line", - '\u1680' => "Ogham Space Mark", - '\u180E' => "Mongolian Vowel Separator (MVS)", - '\ufeff' => "Zero Width No-Break Space (BOM)", - '\u2000' => "En Quad", - '\u2001' => "Em Quad", - '\u2002' => "En Space (ENSP)", - '\u2003' => "Em Space (EMSP)", - '\u2004' => "Tree-Per-Em", - '\u2005' => "Four-Per-Em", - '\u2006' => "Six-Per-Em", - '\u2007' => "Figure Space", - '\u2008' => "Punctuation Space (PUNCSP)", - '\u2009' => "Thin Space", - '\u200A' => "Hair Space", - '\u200B' => "Zero Width Space (ZWSP)", - '\u2028' => "Line Separator", - '\u2029' => "Paragraph Separator", - '\u202F' => "Narrow No-Break Space", - '\u205F' => "Medium Mathematical Space", - '\u3000' => "Ideographic Space", - _ => "Unknown" - }; - - public Task MinimalParseAsync(IFileInfo path, Cancel ctx) { var state = new ParserState(Build) @@ -159,17 +72,11 @@ public Task ParseSnippetAsync(IFileInfo path, IFileInfo parent return ParseAsync(path, context, Pipeline, ctx); } - public MarkdownDocument ParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter) - { - DetectIrregularWhitespace(markdown, path.FullName); - return ParseMarkdownStringAsync(markdown, path, matter, Pipeline); - } + public MarkdownDocument ParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter) => + ParseMarkdownStringAsync(markdown, path, matter, Pipeline); - public MarkdownDocument MinimalParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter) - { - DetectIrregularWhitespace(markdown, path.FullName); - return ParseMarkdownStringAsync(markdown, path, matter, MinimalPipeline); - } + public MarkdownDocument MinimalParseStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter) => + ParseMarkdownStringAsync(markdown, path, matter, MinimalPipeline); private MarkdownDocument ParseMarkdownStringAsync(string markdown, IFileInfo path, YamlFrontMatter? matter, MarkdownPipeline pipeline) { @@ -185,7 +92,7 @@ private MarkdownDocument ParseMarkdownStringAsync(string markdown, IFileInfo pat return markdownDocument; } - private async Task ParseAsync( + private static async Task ParseAsync( IFileInfo path, MarkdownParserContext context, MarkdownPipeline pipeline, @@ -199,12 +106,7 @@ private async Task ParseAsync( inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx); } else - { inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx); - } - - // Check for irregular whitespace characters - DetectIrregularWhitespace(inputMarkdown, path.FullName); var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context); return markdownDocument; @@ -258,6 +160,7 @@ public MarkdownPipeline Pipeline .UseEnhancedCodeBlocks() .UseHtmxLinkInlineRenderer() .DisableHtml() + .UseWhiteSpaceNormalizer() .UseHardBreaks(); _ = builder.BlockParsers.TryRemove(); _pipelineCached = builder.Build(); diff --git a/tests/authoring/Framework/ErrorCollectorAssertions.fs b/tests/authoring/Framework/ErrorCollectorAssertions.fs index 9e844f67a..2557f1b33 100644 --- a/tests/authoring/Framework/ErrorCollectorAssertions.fs +++ b/tests/authoring/Framework/ErrorCollectorAssertions.fs @@ -54,4 +54,19 @@ module DiagnosticsCollectorAssertions = | Some e -> let message = e.Message test <@ message.Contains(expected) @> - | None -> failwithf "Expected errors but no errors were logged" + | None -> failwithf "Expected warnings but no warnings were logged" + + [] + let hasHint (expected: string) (actual: Lazy) = + let actual = actual.Value + actual.Context.Collector.Hints |> shouldBeGreaterThan 0 + let errorDiagnostics = actual.Context.Collector.Diagnostics + .Where(fun d -> d.Severity = Severity.Hint) + .ToArray() + |> List.ofArray + |> List.tryHead + match errorDiagnostics with + | Some e -> + let message = e.Message + test <@ message.Contains(expected) @> + | None -> failwithf "Expected hints but no hints were logged" diff --git a/tests/authoring/Framework/TestValues.fs b/tests/authoring/Framework/TestValues.fs index 549083f63..8ec8e319c 100644 --- a/tests/authoring/Framework/TestValues.fs +++ b/tests/authoring/Framework/TestValues.fs @@ -26,8 +26,10 @@ type TestDiagnosticsOutput() = match diagnostic.Severity with | Severity.Error -> output.WriteLine($"Error: {diagnostic.Message} ({diagnostic.File}:{line})") - | _ -> + | Severity.Warning -> output.WriteLine($"Warn : {diagnostic.Message} ({diagnostic.File}:{line})") + | _ -> + output.WriteLine($"Hint : {diagnostic.Message} ({diagnostic.File}:{line})") | _ -> () diff --git a/tests/authoring/Inline/Comments.fs b/tests/authoring/Inline/Comments.fs index a05bb87a6..c325beb90 100644 --- a/tests/authoring/Inline/Comments.fs +++ b/tests/authoring/Inline/Comments.fs @@ -17,3 +17,4 @@ not a comment [] let ``validate HTML: commented line should not be emitted`` () = markdown |> convertsToHtml """

not a comment

""" + diff --git a/tests/authoring/Linters/WhiteSpaceNormalizers.fs b/tests/authoring/Linters/WhiteSpaceNormalizers.fs new file mode 100644 index 000000000..0f92c2bb1 --- /dev/null +++ b/tests/authoring/Linters/WhiteSpaceNormalizers.fs @@ -0,0 +1,19 @@ +module ``linters``.``white space normalizers`` + +open Xunit +open authoring + + +type ``white space detection`` () = + + static let markdown = Setup.Markdown $""" +not a{'\u000B'}space +""" + + [] + let ``validate HTML: should not contain bad space character`` () = + markdown |> convertsToHtml """

not a space

""" + + [] + let ``emits a hint when a bad space is used`` () = + markdown |> hasHint "Irregular whitespace character detected: U+000B (Line Tabulation (VT)). This may impair Markdown rendering." diff --git a/tests/authoring/authoring.fsproj b/tests/authoring/authoring.fsproj index aa289193f..2520c7227 100644 --- a/tests/authoring/authoring.fsproj +++ b/tests/authoring/authoring.fsproj @@ -58,4 +58,8 @@ + + + + From 44abf3ae2f9501514c71885f7ae4c49f683832e6 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Mon, 19 May 2025 14:11:42 +0200 Subject: [PATCH 8/8] add license headers --- tests/authoring/Linters/WhiteSpaceNormalizers.fs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/authoring/Linters/WhiteSpaceNormalizers.fs b/tests/authoring/Linters/WhiteSpaceNormalizers.fs index 0f92c2bb1..3f8e44ddd 100644 --- a/tests/authoring/Linters/WhiteSpaceNormalizers.fs +++ b/tests/authoring/Linters/WhiteSpaceNormalizers.fs @@ -1,3 +1,7 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + module ``linters``.``white space normalizers`` open Xunit