diff --git a/docs/cli/docset/format.md b/docs/cli/docset/format.md index b5a88154b..ba20a1fe9 100644 --- a/docs/cli/docset/format.md +++ b/docs/cli/docset/format.md @@ -32,15 +32,46 @@ Currently, it handles irregular space characters that may impair Markdown render ### Irregular Space Detection -The format command detects and replaces 24 types of irregular space characters with regular spaces, including: +The format command intelligently handles irregular space characters by categorizing them into three groups: -- No-Break Space (U+00A0) -- En Space (U+2002) -- Em Space (U+2003) +#### Characters removed entirely + +These characters are removed completely as they serve no visual purpose and can cause rendering issues: + +- Line Tabulation (U+000B) +- Form Feed (U+000C) +- Next Line (U+0085) +- Ogham Space Mark (U+1680) +- Mongolian Vowel Separator (U+180E) +- Zero Width No-Break Space/BOM (U+FEFF) - Zero Width Space (U+200B) - Line Separator (U+2028) - Paragraph Separator (U+2029) -- And 18 other irregular space variants + +#### Characters preserved + +These characters are preserved as they serve important typographic or functional purposes: + +- No-Break Space (U+00A0) - Prevents line breaks +- Figure Space (U+2007) - Aligns numbers in tables +- Narrow No-Break Space (U+202F) - French typography +- Medium Mathematical Space (U+205F) - Mathematical expressions + +#### Characters replaced with regular spaces + +These characters are replaced with standard spaces (U+0020) as they can cause inconsistent rendering: + +- En Quad (U+2000) +- Em Quad (U+2001) +- En Space (U+2002) +- Em Space (U+2003) +- Three-Per-Em (U+2004) +- Four-Per-Em (U+2005) +- Six-Per-Em (U+2006) +- Punctuation Space (U+2008) +- Thin Space (U+2009) +- Hair Space (U+200A) +- Ideographic Space (U+3000) These characters can cause unexpected rendering issues in Markdown and are often introduced accidentally through copy-paste operations from other applications. 
diff --git a/src/Elastic.Markdown/Myst/Linters/SpaceNormalizer.cs b/src/Elastic.Markdown/Myst/Linters/SpaceNormalizer.cs index 5a842a2dc..4c4a8b6a7 100644 --- a/src/Elastic.Markdown/Myst/Linters/SpaceNormalizer.cs +++ b/src/Elastic.Markdown/Myst/Linters/SpaceNormalizer.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information using System.Buffers; +using System.Linq; using Elastic.Markdown.Diagnostics; using Markdig; using Markdig.Helpers; @@ -35,16 +36,23 @@ public void Setup(MarkdownPipeline pipeline, IMarkdownRenderer renderer) => public class SpaceNormalizerParser : InlineParser { - // Collection of irregular space characters that may impair Markdown rendering - private static readonly char[] IrregularSpaceChars = + // Characters that should be removed entirely (invisible/problematic) + private static readonly char[] CharactersToRemove = [ '\u000B', // Line Tabulation (\v) - '\u000C', // Form Feed (\f) - - '\u00A0', // No-Break Space - '\u0085', // Next Line '\u1680', // Ogham Space Mark '\u180E', // Mongolian Vowel Separator - '\ufeff', // Zero Width No-Break Space - + '\u200B', // Zero Width Space - + '\u2028', // Line Separator + '\u2029' // Paragraph Separator + ]; + + // Characters to replace with regular spaces (visible but problematic) + private static readonly char[] CharactersToReplace = + [ '\u2000', // En Quad '\u2001', // Em Quad '\u2002', // En Space - @@ -52,23 +60,20 @@ public class SpaceNormalizerParser : InlineParser '\u2004', // Tree-Per-Em '\u2005', // Four-Per-Em '\u2006', // Six-Per-Em - '\u2007', // Figure Space '\u2008', // Punctuation Space - '\u2009', // Thin Space '\u200A', // Hair Space - '\u200B', // Zero Width Space - - '\u2028', // Line Separator - '\u2029', // Paragraph Separator - '\u202F', // Narrow No-Break Space - '\u205F', // Medium Mathematical Space '\u3000' // Ideographic Space ]; - private static readonly SearchValues SpaceSearchValues = SearchValues.Create(IrregularSpaceChars); + + // Combined 
list of characters that need fixing (removed or replaced) + private static readonly char[] CharactersToFix = CharactersToRemove.Concat(CharactersToReplace).ToArray(); + private static readonly SearchValues SpaceSearchValues = SearchValues.Create(CharactersToFix); // Track which files have already had the hint emitted to avoid duplicates private static readonly HashSet FilesWithHintEmitted = []; - public SpaceNormalizerParser() => OpeningCharacters = IrregularSpaceChars; + public SpaceNormalizerParser() => OpeningCharacters = CharactersToFix; public override bool Match(InlineProcessor processor, ref StringSlice slice) { diff --git a/src/authoring/Elastic.Documentation.Refactor/Formatters/IrregularSpaceFormatter.cs b/src/authoring/Elastic.Documentation.Refactor/Formatters/IrregularSpaceFormatter.cs index 6d4212e55..1bb00d6c6 100644 --- a/src/authoring/Elastic.Documentation.Refactor/Formatters/IrregularSpaceFormatter.cs +++ b/src/authoring/Elastic.Documentation.Refactor/Formatters/IrregularSpaceFormatter.cs @@ -8,22 +8,41 @@ namespace Elastic.Documentation.Refactor.Formatters; /// -/// Formatter that replaces irregular space characters with regular spaces +/// Formatter that handles irregular space characters appropriately: +/// - Removes invisible characters entirely +/// - Preserves semantically meaningful spaces +/// - Replaces problematic spaces with regular spaces /// public class IrregularSpaceFormatter : IFormatter { public string Name => "irregular space"; - // Collection of irregular space characters that may impair Markdown rendering - private static readonly char[] IrregularSpaceChars = + // Characters to remove entirely (invisible/problematic) + private static readonly char[] CharactersToRemove = [ '\u000B', // Line Tabulation (\v) - '\u000C', // Form Feed (\f) - - '\u00A0', // No-Break Space - '\u0085', // Next Line '\u1680', // Ogham Space Mark '\u180E', // Mongolian Vowel Separator - '\ufeff', // Zero Width No-Break Space - + '\u200B', // Zero Width 
Space - + '\u2028', // Line Separator + '\u2029' // Paragraph Separator + ]; + + // Characters to preserve (semantically meaningful) + private static readonly char[] CharactersToPreserve = + [ + '\u00A0', // No-Break Space - + '\u2007', // Figure Space + '\u202F', // Narrow No-Break Space + '\u205F' // Medium Mathematical Space + ]; + + // Characters to replace with regular spaces (visible but problematic) + private static readonly char[] CharactersToReplace = + [ '\u2000', // En Quad '\u2001', // Em Quad '\u2002', // En Space - @@ -31,39 +50,50 @@ public class IrregularSpaceFormatter : IFormatter '\u2004', // Tree-Per-Em '\u2005', // Four-Per-Em '\u2006', // Six-Per-Em - '\u2007', // Figure Space '\u2008', // Punctuation Space - '\u2009', // Thin Space '\u200A', // Hair Space - '\u200B', // Zero Width Space - - '\u2028', // Line Separator - '\u2029', // Paragraph Separator - '\u202F', // Narrow No-Break Space - '\u205F', // Medium Mathematical Space '\u3000' // Ideographic Space ]; - private static readonly SearchValues IrregularSpaceSearchValues = SearchValues.Create(IrregularSpaceChars); + private static readonly SearchValues CharactersToRemoveValues = SearchValues.Create(CharactersToRemove); + private static readonly SearchValues CharactersToPreserveValues = SearchValues.Create(CharactersToPreserve); + private static readonly SearchValues CharactersToReplaceValues = SearchValues.Create(CharactersToReplace); public FormatResult Format(string content) { - // Quick check - if no irregular space, return original - if (content.AsSpan().IndexOfAny(IrregularSpaceSearchValues) == -1) + // Quick check - if no irregular space characters, return original + var span = content.AsSpan(); + if (span.IndexOfAny(CharactersToRemoveValues) == -1 && + span.IndexOfAny(CharactersToPreserveValues) == -1 && + span.IndexOfAny(CharactersToReplaceValues) == -1) return new FormatResult(content, 0); - // Replace irregular space with regular spaces + // Process each character with 
appropriate handling var sb = new StringBuilder(content.Length); var replacements = 0; foreach (var c in content) { - if (IrregularSpaceSearchValues.Contains(c)) + if (CharactersToRemoveValues.Contains(c)) + { + // Remove invisible/problematic characters entirely + replacements++; + } + else if (CharactersToPreserveValues.Contains(c)) + { + // Preserve semantically meaningful characters + _ = sb.Append(c); + } + else if (CharactersToReplaceValues.Contains(c)) { + // Replace problematic visible characters with regular spaces _ = sb.Append(' '); replacements++; } else { + // Keep regular characters as-is _ = sb.Append(c); } }