Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 36 additions & 5 deletions docs/cli/docset/format.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,46 @@ Currently, it handles irregular space characters that may impair Markdown render

### Irregular Space Detection

The format command detects and replaces 24 types of irregular space characters with regular spaces, including:
The format command intelligently handles irregular space characters by categorizing them into three groups:

- No-Break Space (U+00A0)
- En Space (U+2002)
- Em Space (U+2003)
#### Characters removed entirely

These characters are removed completely as they serve no visual purpose and can cause rendering issues:

- Line Tabulation (U+000B)
- Form Feed (U+000C)
- Next Line (U+0085)
- Ogham Space Mark (U+1680)
- Mongolian Vowel Separator (U+180E)
- Zero Width No-Break Space/BOM (U+FEFF)
- Zero Width Space (U+200B)
- Line Separator (U+2028)
- Paragraph Separator (U+2029)
- And 18 other irregular space variants

#### Characters preserved

These characters are preserved as they serve important typographic or functional purposes:

- No-Break Space (U+00A0) - Prevents line breaks
- Figure Space (U+2007) - Aligns numbers in tables
- Narrow No-Break Space (U+202F) - French typography
- Medium Mathematical Space (U+205F) - Mathematical expressions

#### Characters replaced with regular spaces

These characters are replaced with standard spaces (U+0020) as they can cause inconsistent rendering:

- En Quad (U+2000)
- Em Quad (U+2001)
- En Space (U+2002)
- Em Space (U+2003)
- Three-Per-Em (U+2004)
- Four-Per-Em (U+2005)
- Six-Per-Em (U+2006)
- Punctuation Space (U+2008)
- Thin Space (U+2009)
- Hair Space (U+200A)
- Ideographic Space (U+3000)

These characters can cause unexpected rendering issues in Markdown and are often introduced accidentally through copy-paste operations from other applications.

Expand Down
27 changes: 16 additions & 11 deletions src/Elastic.Markdown/Myst/Linters/SpaceNormalizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// See the LICENSE file in the project root for more information

using System.Buffers;
using System.Linq;
using Elastic.Markdown.Diagnostics;
using Markdig;
using Markdig.Helpers;
Expand Down Expand Up @@ -35,40 +36,44 @@ public void Setup(MarkdownPipeline pipeline, IMarkdownRenderer renderer) =>

public class SpaceNormalizerParser : InlineParser
{
// Collection of irregular space characters that may impair Markdown rendering
private static readonly char[] IrregularSpaceChars =
// Characters that should be removed entirely (invisible/problematic)
private static readonly char[] CharactersToRemove =
[
'\u000B', // Line Tabulation (\v) - <VT>
'\u000C', // Form Feed (\f) - <FF>
'\u00A0', // No-Break Space - <NBSP>
'\u0085', // Next Line
'\u1680', // Ogham Space Mark
'\u180E', // Mongolian Vowel Separator - <MVS>
'\ufeff', // Zero Width No-Break Space - <BOM>
'\u200B', // Zero Width Space - <ZWSP>
'\u2028', // Line Separator
'\u2029' // Paragraph Separator
];

// Characters to replace with regular spaces (visible but problematic)
private static readonly char[] CharactersToReplace =
[
'\u2000', // En Quad
'\u2001', // Em Quad
'\u2002', // En Space - <ENSP>
'\u2003', // Em Space - <EMSP>
'\u2004', // Tree-Per-Em
'\u2005', // Four-Per-Em
'\u2006', // Six-Per-Em
'\u2007', // Figure Space
'\u2008', // Punctuation Space - <PUNCSP>
'\u2009', // Thin Space
'\u200A', // Hair Space
'\u200B', // Zero Width Space - <ZWSP>
'\u2028', // Line Separator
'\u2029', // Paragraph Separator
'\u202F', // Narrow No-Break Space
'\u205F', // Medium Mathematical Space
'\u3000' // Ideographic Space
];
private static readonly SearchValues<char> SpaceSearchValues = SearchValues.Create(IrregularSpaceChars);

// Combined list of characters that need fixing (removed or replaced)
private static readonly char[] CharactersToFix = CharactersToRemove.Concat(CharactersToReplace).ToArray();
private static readonly SearchValues<char> SpaceSearchValues = SearchValues.Create(CharactersToFix);

// Track which files have already had the hint emitted to avoid duplicates
private static readonly HashSet<string> FilesWithHintEmitted = [];

public SpaceNormalizerParser() => OpeningCharacters = IrregularSpaceChars;
public SpaceNormalizerParser() => OpeningCharacters = CharactersToFix;

public override bool Match(InlineProcessor processor, ref StringSlice slice)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,62 +8,92 @@
namespace Elastic.Documentation.Refactor.Formatters;

/// <summary>
/// Formatter that replaces irregular space characters with regular spaces
/// Formatter that handles irregular space characters appropriately:
/// - Removes invisible characters entirely
/// - Preserves semantically meaningful spaces
/// - Replaces problematic spaces with regular spaces
/// </summary>
public class IrregularSpaceFormatter : IFormatter
{
public string Name => "irregular space";

// Collection of irregular space characters that may impair Markdown rendering
private static readonly char[] IrregularSpaceChars =
// Characters to remove entirely (invisible/problematic)
private static readonly char[] CharactersToRemove =
[
'\u000B', // Line Tabulation (\v) - <VT>
'\u000C', // Form Feed (\f) - <FF>
'\u00A0', // No-Break Space - <NBSP>
'\u0085', // Next Line
'\u1680', // Ogham Space Mark
'\u180E', // Mongolian Vowel Separator - <MVS>
'\ufeff', // Zero Width No-Break Space - <BOM>
'\u200B', // Zero Width Space - <ZWSP>
'\u2028', // Line Separator
'\u2029' // Paragraph Separator
];

// Characters to preserve (semantically meaningful)
private static readonly char[] CharactersToPreserve =
[
'\u00A0', // No-Break Space - <NBSP>
'\u2007', // Figure Space
'\u202F', // Narrow No-Break Space
'\u205F' // Medium Mathematical Space
];

// Characters to replace with regular spaces (visible but problematic)
private static readonly char[] CharactersToReplace =
[
'\u2000', // En Quad
'\u2001', // Em Quad
'\u2002', // En Space - <ENSP>
'\u2003', // Em Space - <EMSP>
'\u2004', // Tree-Per-Em
'\u2005', // Four-Per-Em
'\u2006', // Six-Per-Em
'\u2007', // Figure Space
'\u2008', // Punctuation Space - <PUNCSP>
'\u2009', // Thin Space
'\u200A', // Hair Space
'\u200B', // Zero Width Space - <ZWSP>
'\u2028', // Line Separator
'\u2029', // Paragraph Separator
'\u202F', // Narrow No-Break Space
'\u205F', // Medium Mathematical Space
'\u3000' // Ideographic Space
];

private static readonly SearchValues<char> IrregularSpaceSearchValues = SearchValues.Create(IrregularSpaceChars);
private static readonly SearchValues<char> CharactersToRemoveValues = SearchValues.Create(CharactersToRemove);
private static readonly SearchValues<char> CharactersToPreserveValues = SearchValues.Create(CharactersToPreserve);
private static readonly SearchValues<char> CharactersToReplaceValues = SearchValues.Create(CharactersToReplace);

public FormatResult Format(string content)
{
// Quick check - if no irregular space, return original
if (content.AsSpan().IndexOfAny(IrregularSpaceSearchValues) == -1)
// Quick check - if no irregular space characters, return original
var span = content.AsSpan();
if (span.IndexOfAny(CharactersToRemoveValues) == -1 &&
span.IndexOfAny(CharactersToPreserveValues) == -1 &&
span.IndexOfAny(CharactersToReplaceValues) == -1)
return new FormatResult(content, 0);

// Replace irregular space with regular spaces
// Process each character with appropriate handling
var sb = new StringBuilder(content.Length);
var replacements = 0;

foreach (var c in content)
{
if (IrregularSpaceSearchValues.Contains(c))
if (CharactersToRemoveValues.Contains(c))
{
// Remove invisible/problematic characters entirely
replacements++;
}
else if (CharactersToPreserveValues.Contains(c))
{
// Preserve semantically meaningful characters
_ = sb.Append(c);
}
else if (CharactersToReplaceValues.Contains(c))
{
// Replace problematic visible characters with regular spaces
_ = sb.Append(' ');
replacements++;
}
else
{
// Keep regular characters as-is
_ = sb.Append(c);
}
}
Expand Down
Loading