Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pass more commonmark emphasis parsing tests #763

Merged
merged 5 commits into from
Aug 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
## 16.0.2
* [Improves markdown emphasis parsing.](https://github.com/fsprojects/FSharp.Formatting/pull/763)

## 16.0.1

* Custom templaying for menus
Expand Down
181 changes: 151 additions & 30 deletions src/FSharp.Formatting.Markdown/MarkdownParser.fs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ let getLinkAndTitle (StringPosition.TrimBoth (input, _n)) =

/// Succeeds when the specified character list starts with an escaped
/// character - in that case, returns the character and the tail of the list
let inline (|EscapedChar|_|) input =
let (|EscapedChar|_|) input =
match input with
| '\\' :: (('*'
| '\\'
Expand All @@ -64,22 +64,132 @@ let inline (|EscapedChar|_|) input =
| _ -> None

/// Escape dollar inside a LaTex inline math span.
let inline (|EscapedLatexInlineMathChar|_|) input =
let (|EscapedLatexInlineMathChar|_|) input =
match input with
| '\\' :: (('$') as c) :: rest -> Some(c, rest)
| _ -> None

/// Succeeds when the specified character list starts with a letter or number
let inline (|AlphaNum|_|) input =
let re = """^[a-zA-Z0-9]"""
let match' = Regex.Match(Array.ofList input |> String, re)
/// Succeeds when the specificed character list starts with non-escaped punctuation.
let (|Punctuation|_|) input =
match input with
| EscapedChar _ -> None
| _ ->
// from https://github.com/commonmark/commonmark.js/blob/master/lib/inlines.js#L38
let re =
"""^[!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0AF0\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E42\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC9\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDF3C-\uDF3E]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]"""

if match'.Success then
let entity = match'.Value
let _, rest = List.splitAt entity.Length input
Some(char entity, rest)
else
None
let match' = Regex.Match(Array.ofList input |> String, re)

if match'.Success then
let entity = match'.Value
let _, rest = List.splitAt entity.Length input
Some(char entity, rest)
else
None

let (|NotPunctuation|_|) input =
match input with
| Punctuation _ -> None
| _ -> Some input

module Char =
let (|WhiteSpace|_|) input =
match input with
| [] -> Some input
| x :: _xs ->
if String.IsNullOrWhiteSpace(string x) then
Some input
else
None

let (|NotWhiteSpace|_|) input =
match input with
| WhiteSpace _ -> None
| _ -> Some input

/// Succeeds when the specificed character list starts with a delimeter run.
let (|DelimiterRun|_|) input =
match input with
| ('*'
| '_') :: _tail as (h :: t) ->
let run, rest = List.partitionWhile (fun x -> x = h) (h :: t)
Some(run, rest)
| _ -> None

/// Succeeds when there's a match to a string of * or _ that could
/// open emphasis.
let (|LeftDelimiterRun|_|) input =
match input with
// (1) Not followed by [Unicode whitespace] and
// (2a) not followed by a [Unicode punctuation character] or
// (2b) followed by a [Unicode punctuation character] and
// preceded by [Unicode whitespace] or a [Unicode punctuation character].
//
// Passes 1 and 2a.
| DelimiterRun (_, Char.NotWhiteSpace _) & DelimiterRun (run, NotPunctuation xs) -> Some([], run, xs)
| _ :: DelimiterRun (_, Char.NotWhiteSpace _) & h :: DelimiterRun (run, NotPunctuation xs) -> Some([ h ], run, xs)
// Passes 1 and 2b
| h :: DelimiterRun (run, Punctuation (x, xs)) ->
match [ h ] with
| Char.WhiteSpace _
| Punctuation _ -> Some([ h ], run, x :: xs)
| _ -> None
// Passes 1 and 2b when the run is at the start of the line.
// |CannotStartEmphasis| ensures that we don't match this
// when we've previously discarded a leading character.
| DelimiterRun (run, Punctuation (x, xs)) -> Some([], run, x :: xs)
| _ -> None

/// Succeeds when there's a match to a string of * or _ that could
/// close emphasis.
let (|RightDelimiterRun|_|) input =
match input with
// A right-flanking delimiter run is
// 1. not preceded by [Unicode whitepace]
// 2. And either
// a. not preceded by a [Unicode punctuation character], or
// b. preceded by a [Unicode punctuation character] and
// followed by [Unicode whitespace] or a [Unicode punctuation character]
//
// An escaped character followed by delimiter run matches 1 and 2a.
| EscapedChar (x, DelimiterRun (run, xs)) -> Some([ '\\'; x ], run, xs)
| EscapedChar _ -> None
| Char.NotWhiteSpace _ & x :: DelimiterRun (run, xs) ->
match input with
// 1 and 2a
| NotPunctuation _ -> Some([ x ], run, xs)
// 1 and 2b
| Punctuation (x, DelimiterRun (run, Char.WhiteSpace ys)) -> Some([ x ], run, ys)
// 1 and 2b
| Punctuation (x, DelimiterRun (run, Punctuation (y, ys))) -> Some([ x ], run, y :: ys)
| _ -> None
| _ -> None

/// Matches ['c',LeftDelimiterRun]::xs that should
/// not open emphasis. This is useful because the
/// parser iterates through characters one by one and
/// in this case we need to skip both 'c' and the LeftDelimiterRun.
/// If we only skipped 'c' then we could match LeftDelimiterRun
/// on the next iteration and we do not want that to happen.
let (|CannotOpenEmphasis|_|) input =
match input with
// Rule #2: A single `_` character [can open emphasis] iff
// it is part of a [left-flanking delimiter run]
// and either (a) not part of a [right-flanking delimiter run]
// or (b) part of a [right-flanking delimiter run]
// preceded by a [Unicode punctuation character].
| LeftDelimiterRun _ & RightDelimiterRun (pre, [ '_' ], post) ->
match List.rev pre with
| Punctuation _ -> None
| revPre -> Some('_' :: revPre, post)
// We cannot pass 1 and 2b of the left flanking rule
// when h is neither white space nor punctuation.
| h :: DelimiterRun (run, Punctuation (x, xs)) ->
match [ h ] with
| Char.WhiteSpace _
| Punctuation _ -> None
| _ -> Some(List.rev (h :: run), x :: xs)
| _ -> None

/// Matches a list if it starts with a sub-list that is delimited
/// using the specified delimiters. Returns a wrapped list and the rest.
Expand All @@ -88,20 +198,29 @@ let inline (|AlphaNum|_|) input =
let (|DelimitedMarkdown|_|) bracket input =
let _startl, endl = bracket, bracket
// Like List.partitionUntilEquals, but skip over escaped characters
let rec loop acc =
let rec loop acc count =
function
| EscapedChar (x, xs) -> loop (x :: '\\' :: acc) xs
| input when List.startsWith endl input ->
let rest = List.skip bracket.Length input

match rest with
| AlphaNum (x, xs) -> loop (x :: endl @ acc) xs
| _ -> Some(List.rev acc, input)
| x :: xs -> loop (x :: acc) xs
| (RightDelimiterRun (pre, [ '_' ], post) as input) when endl = [ '_' ] ->
match input with
| LeftDelimiterRun (pre, run, (Punctuation _ as post)) ->
if count = 0 then
Some((List.rev acc) @ pre, run @ post)
else
loop ((List.rev (pre @ run)) @ acc) (count - 1) post
| LeftDelimiterRun (pre, run, post) -> loop ((List.rev (pre @ run)) @ acc) (count + 1) post
| _ -> Some((List.rev acc) @ pre, [ '_' ] @ post)
| RightDelimiterRun (pre, run, post) when endl = run ->
if count = 0 then
Some((List.rev acc) @ pre, run @ post)
else
loop ((List.rev (pre @ run)) @ acc) (count - 1) post
| EscapedChar (x, xs) -> loop (x :: '\\' :: acc) count xs
| LeftDelimiterRun (pre, run, post) when run = endl -> loop ((List.rev (pre @ run)) @ acc) (count + 1) post
| x :: xs -> loop (x :: acc) count xs
| [] -> None
// If it starts with 'startl', let's search for 'endl'
if List.startsWith bracket input then
match loop [] (List.skip bracket.Length input) with
match loop [] 0 (List.skip bracket.Length input) with
| Some (pre, post) -> Some(pre, List.skip bracket.Length post)
| None -> None
else
Expand Down Expand Up @@ -183,16 +302,15 @@ let (|AutoLink|_|) input =
/// TODO: This does not handle nested emphasis well.
let (|Emphasised|_|) =
function
| (('_'
| '*') :: _tail) as input ->
match input with
| LeftDelimiterRun (pre, run, post) ->
match run @ post with
| DelimitedMarkdown [ '_'; '_'; '_' ] (body, rest)
| DelimitedMarkdown [ '*'; '*'; '*' ] (body, rest) ->
Some(body, Emphasis >> List.singleton >> (fun s -> Strong(s, None)), rest)
Some(pre, body, Emphasis >> List.singleton >> (fun s -> Strong(s, None)), rest)
| DelimitedMarkdown [ '_'; '_' ] (body, rest)
| DelimitedMarkdown [ '*'; '*' ] (body, rest) -> Some(body, Strong, rest)
| DelimitedMarkdown [ '*'; '*' ] (body, rest) -> Some(pre, body, Strong, rest)
| DelimitedMarkdown [ '_' ] (body, rest)
| DelimitedMarkdown [ '*' ] (body, rest) -> Some(body, Emphasis, rest)
| DelimitedMarkdown [ '*' ] (body, rest) -> Some(pre, body, Emphasis, rest)
| _ -> None
| _ -> None

Expand Down Expand Up @@ -392,13 +510,16 @@ let rec parseChars acc input (ctx: ParsingContext) =
yield IndirectImage(String(Array.ofList body), original, key, ctx.CurrentRange)
yield! parseChars [] rest ctx

// Handle emphasised text
| Emphasised (body, f, rest) ->
// Handle Emphasis
| CannotOpenEmphasis (revPre, post) -> yield! parseChars (revPre @ acc) post ctx
| Emphasised (pre, body, f, rest) ->
let (value, ctx) = accLiterals.Value
yield! value
yield! parseChars [] pre ctx
let body = parseChars [] body ctx |> List.ofSeq
yield f (body, ctx.CurrentRange)
yield! parseChars [] rest ctx

// Encode '<' char if it is not link or inline HTML
| '<' :: rest -> yield! parseChars (';' :: 't' :: 'l' :: '&' :: acc) rest ctx
| '>' :: rest -> yield! parseChars (';' :: 't' :: 'g' :: '&' :: acc) rest ctx
Expand Down
106 changes: 106 additions & 0 deletions tests/FSharp.Markdown.Tests/Markdown.fs
Original file line number Diff line number Diff line change
Expand Up @@ -877,3 +877,109 @@ let ``Underscore inside italic and bold near punctuation is preserved`` () =
|> properNewLines

Markdown.ToHtml doc |> shouldEqual expected

[<Test>]
let ``emphasis with space`` () =
let doc = "*foo bar*"
let actual = "<p><em>foo bar</em></p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``No emphasis if opening * is followed by whitespace`` () =
let doc = "a * foo bar*"
let actual = "<p>a * foo bar*</p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``No emphasis if opening * is preceded by alphanumeric and followed by punctuation`` () =
let doc = """a*"foo"*"""
let actual = """<p>a*"foo"*</p>""" + "\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``Intraword emphasis with * is permitted`` () =
let doc = "foo*bar*"
let actual = "<p>foo<em>bar</em></p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

let doc2 = "5*6*78"
let actual2 = "<p>5<em>6</em>78</p>\r\n" |> properNewLines
Markdown.ToHtml doc2 |> shouldEqual actual2

[<Test>]
let ``emphasis using _ with space`` () =
let doc = "_foo bar_"
let actual = "<p><em>foo bar</em></p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``No emphasis if opening _ is followed by whitespace`` () =
let doc = "_ foo bar_"
let actual = "<p>_ foo bar_</p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``No emphasis if opening _ is preceded by alphanumeric and followed by punctuation`` () =
let doc = """a_"foo"_"""
let actual = """<p>a_"foo"_</p>""" + "\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``Intraword emphasis with _ is not permitted`` () =
let doc = "foo_bar_"
let actual = "<p>foo_bar_</p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

let doc2 = "5_6_78"
let actual2 = "<p>5_6_78</p>\r\n" |> properNewLines
Markdown.ToHtml doc2 |> shouldEqual actual2

let doc3 = "пристаням_стремятся_"
let actual3 = "<p>пристаням_стремятся_</p>\r\n" |> properNewLines
Markdown.ToHtml doc3 |> shouldEqual actual3

[<Test>]
let ``No emphasis if first _ is right flanking and second is left flanking`` () =
let doc = """aa_"bb"_cc"""
let actual = """<p>aa_"bb"_cc</p>""" + "\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``Emphasis if first _ is left and right flanking and preceded by punctuation`` () =
let doc = "foo-_(bar)_"
let actual = "<p>foo-<em>(bar)</em></p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``No emphasis if open and close delim do not match`` () =
let doc = "_foo*"
let actual = "<p>_foo*</p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``No emphasis if closing * is preceded by whitespace`` () =
let doc1 = "*foo bar *"
let actual1 = "<p>*foo bar *</p>\r\n" |> properNewLines
Markdown.ToHtml doc1 |> shouldEqual actual1

let doc2 =
"*foo bar
*"

let actual2 =
"<p>*foo bar
*</p>"
+ "\r\n"
|> properNewLines

Markdown.ToHtml doc2 |> shouldEqual actual2

[<Test>]
let ``Do not close emphasis if second * is preceded by punctuation and followed by alphanumeric`` () =
let doc = "*(*foo)"
let actual = "<p>*(*foo)</p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

let doc2 = "*(*foo*)*"
let actual2 = "<p><em>(<em>foo</em>)</em></p>\r\n" |> properNewLines
Markdown.ToHtml doc2 |> shouldEqual actual2