Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions lib/markbridge.rb
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,19 @@ def build_renderer(tag_library:)
Renderers::Discourse::Renderer.new(tag_library:, escaper:)
end

# Matches a run of one or more "invisible" characters sitting at the end of
# a line. Applied only when opted into via
# `Markbridge.configuration.strip_trailing_invisibles = true`.
#
# Character set: NBSP (U+00A0) plus the zero-width format chars that render
# as nothing — ZWSP U+200B, ZWNJ U+200C, ZWJ U+200D, WJ U+2060,
# ZWNBSP/BOM U+FEFF. ASCII space and tab are deliberately excluded so
# Markdown's "two trailing spaces = hard line break" rule still works.
#
# The `$` anchors to end-of-line (default Ruby regex mode), so a `gsub`
# with this pattern strips per line without consuming the line break.
# NOTE(review): `$` only matches before "\n" or at end of string; a line
# ending in "\r\n" would keep its trailing invisibles — confirm the
# rendered output never carries CR.
TRAILING_INVISIBLE_RE = /[\u00A0\u200B\u200C\u200D\u2060\uFEFF]+$/
# Implementation detail: keep the pattern out of the public constant surface.
private_constant :TRAILING_INVISIBLE_RE

def cleanup_markdown(text)
text = text.gsub(TRAILING_INVISIBLE_RE, "") if configuration.strip_trailing_invisibles
text
.gsub(/\n{3,}/, "\n\n") # Max 2 consecutive newlines
.gsub(/^[ \t]+$/, "") # Remove whitespace-only lines
Expand Down
10 changes: 9 additions & 1 deletion lib/markbridge/configuration.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,18 @@

module Markbridge
  # Holds Markbridge's configuration flags. Every flag starts out as `false`.
  class Configuration
    # @return [Boolean] hard-line-break escaping toggle (default: false).
    #   NOTE(review): the code that consumes this flag is not visible from
    #   this class — confirm its exact effect against the renderer.
    attr_accessor :escape_hard_line_breaks

    # When true, `cleanup_markdown` rstrips a small set of invisible
    # characters (NBSP, ZWSP, ZWNJ, ZWJ, WJ, ZWNBSP/BOM) at each line
    # end. Useful for cleaning Outlook/Word HTML exports where these
    # show up as soft-break hints and spacer-paragraph fillers. Adds
    # one regex pass over the rendered output (~4-5% slowdown on a
    # mixed-content benchmark), so default off; opt in if the polish
    # matters more than throughput.
    #
    # @return [Boolean] trailing-invisibles stripping toggle (default: false).
    attr_accessor :strip_trailing_invisibles

    # Builds a configuration with every option switched off.
    def initialize
      @escape_hard_line_breaks = false
      @strip_trailing_invisibles = false
    end
  end
end
38 changes: 38 additions & 0 deletions spec/markbridge_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,44 @@ def render(_element, _interface, **_kwargs)
it "strips leading and trailing whitespace from the final output" do
  # Spaces surrounding the input must not survive into the converted output.
  markdown = described_class.bbcode_to_markdown(" hi ")

  expect(markdown).to eq("hi")
end

it "preserves trailing invisible characters by default (config flag off)" do
  # With the flag left at its default, cleanup performs no invisible-
  # stripping pass, so the ZWSP produced by `&#8203;` (UTF-8 bytes
  # e2 80 8b) must still follow the final "o" of the rendered output.
  tail_bytes = described_class.html_to_markdown("<p>Hello&#8203;</p>").bytes.last(4)

  expect(tail_bytes).to eq([0x6f, 0xe2, 0x80, 0x8b]) # "o" + ZWSP
end

context "with strip_trailing_invisibles enabled" do
  before { described_class.configure { |c| c.strip_trailing_invisibles = true } }

  # The flag is global configuration: restore the defaults after every
  # example so it cannot leak into examples that expect the flag to be off.
  after { described_class.reset_defaults! }

  it "strips a trailing zero-width space at the end of a paragraph" do
    # Outlook-style soft-break ZWSP after content.
    expect(described_class.html_to_markdown("<p>Hello&#8203;</p>")).to eq("Hello")
  end

  it "drops nbsp-only spacer paragraphs by stripping their trailing nbsp" do
    # `<p>&nbsp;</p>` collapses to "<nbsp>\n\n"; rstripping the nbsp
    # leaves an empty line which the existing \n{3,} collapse drops.
    expect(described_class.html_to_markdown("<p>a</p><p>&nbsp;</p><p>b</p>")).to eq("a\n\nb")
  end

  it "preserves trailing ASCII spaces — they are the Markdown hard-break syntax" do
    # `hello  \nworld` (exactly two trailing spaces) is the hard-line-break
    # form; the trailing-invisibles strip must not touch ASCII spaces.
    hard_break_source = "hello  \nworld"

    expect(described_class.bbcode_to_markdown(hard_break_source)).to eq(hard_break_source)
  end

  it "strips trailing invisibles on every affected line, not just the first" do
    # gsub vs sub: with sub, only the first paragraph would get cleaned
    # and the second's ZWSP would leak through.
    result = described_class.html_to_markdown("<p>first&#8203;</p><p>second&#8203;</p>")

    expect(result).to eq("first\n\nsecond")
  end
end
end

describe ".parse_html" do
Expand Down
39 changes: 39 additions & 0 deletions spec/system/html_to_markdown_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,45 @@
end
end

# System-level coverage for the opt-in trailing-invisibles cleanup.
#
# Invisible characters in fixtures are written as escapes ("\u200B" for
# ZWSP, "\u00A0" for NBSP) instead of being typed literally, so they stay
# visible in review and cannot be silently dropped by an editor or a
# whitespace-normalising tool.
describe "trailing invisible characters with strip_trailing_invisibles" do
  # Enable the flag for the example only; always restore the defaults,
  # even when the example raises.
  around do |example|
    Markbridge.configure { |c| c.strip_trailing_invisibles = true }
    example.run
  ensure
    Markbridge.reset_defaults!
  end

  it "strips trailing zero-width space at the end of a paragraph" do
    html = "<p>Hello Specialist&#8203;</p><p>Our customer are unhappy</p>"

    result = Markbridge.html_to_markdown(html)
    expect(result).to eq("Hello Specialist\n\nOur customer are unhappy")
  end

  it "drops an Outlook-style nbsp-only spacer paragraph between content" do
    html = '<p>before</p><p class="MsoNormal">&nbsp;</p><p>after</p>'

    result = Markbridge.html_to_markdown(html)
    expect(result).to eq("before\n\nafter")
  end

  it "preserves leading nbsp (author intent — used as indentation)" do
    html = "<p>&nbsp;Hello</p>"

    result = Markbridge.html_to_markdown(html)
    # The expected prefix is a real NBSP (U+00A0), not an ASCII space.
    expect(result).to eq("\u00A0Hello")
  end

  it "preserves invisibles in the middle of content" do
    # Mid-content ZWSP is a meaningful soft-break hint (long URLs, CJK),
    # only line-end invisibles get stripped.
    html = "<p>before\u200Binline\u200Btext</p>"

    result = Markbridge.html_to_markdown(html)
    expect(result).to eq("before\u200Binline\u200Btext")
  end
end

describe "complex combinations" do
it "converts mixed content" do
html = <<~HTML
Expand Down
11 changes: 11 additions & 0 deletions spec/unit/markbridge/configuration_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,15 @@
expect(configuration.escape_hard_line_breaks).to be true
end
end

describe "#strip_trailing_invisibles" do
  # The option is opt-in: a fresh configuration must report it as off.
  it "defaults to false" do
    initial_value = configuration.strip_trailing_invisibles

    expect(initial_value).to be false
  end

  # The writer must be reflected by the reader.
  it "can be set to true" do
    configuration.strip_trailing_invisibles = true
    updated_value = configuration.strip_trailing_invisibles

    expect(updated_value).to be true
  end
end
end