diff --git a/markdown_it/parser_core.py b/markdown_it/parser_core.py index 98ce7aa2..64221495 100644 --- a/markdown_it/parser_core.py +++ b/markdown_it/parser_core.py @@ -8,16 +8,16 @@ from .ruler import Ruler from .rules_core.state_core import StateCore -from .rules_core import normalize, block, inline, replace +from .rules_core import normalize, block, inline, replace, smartquotes -# TODO linkify, replacements, smartquotes +# TODO linkify _rules = [ ["normalize", normalize], ["block", block], ["inline", inline], # [ 'linkify', require('./rules_core/linkify') ], ["replacements", replace], - # [ 'smartquotes', require('./rules_core/smartquotes') ] + ["smartquotes", smartquotes], ] diff --git a/markdown_it/rules_core/__init__.py b/markdown_it/rules_core/__init__.py index 72757f51..401e527d 100644 --- a/markdown_it/rules_core/__init__.py +++ b/markdown_it/rules_core/__init__.py @@ -3,3 +3,4 @@ from .block import block # noqa: F401 from .inline import inline # noqa: F401 from .replacements import replace # noqa: F401 +from .smartquotes import smartquotes # noqa: F401 diff --git a/markdown_it/rules_core/smartquotes.js b/markdown_it/rules_core/smartquotes.js deleted file mode 100644 index 155e7a61..00000000 --- a/markdown_it/rules_core/smartquotes.js +++ /dev/null @@ -1,201 +0,0 @@ -// Convert straight quotation marks to typographic ones -// -'use strict'; - - -var isWhiteSpace = require('../common/utils').isWhiteSpace; -var isPunctChar = require('../common/utils').isPunctChar; -var isMdAsciiPunct = require('../common/utils').isMdAsciiPunct; - -var QUOTE_TEST_RE = /['"]/; -var QUOTE_RE = /['"]/g; -var APOSTROPHE = '\u2019'; /* ’ */ - - -function replaceAt(str, index, ch) { - return str.substr(0, index) + ch + str.substr(index + 1); -} - -function process_inlines(tokens, state) { - var i, token, text, t, pos, max, thisLevel, item, lastChar, nextChar, - isLastPunctChar, isNextPunctChar, isLastWhiteSpace, isNextWhiteSpace, - canOpen, canClose, j, isSingle, stack, 
openQuote, closeQuote; - - stack = []; - - for (i = 0; i < tokens.length; i++) { - token = tokens[i]; - - thisLevel = tokens[i].level; - - for (j = stack.length - 1; j >= 0; j--) { - if (stack[j].level <= thisLevel) { break; } - } - stack.length = j + 1; - - if (token.type !== 'text') { continue; } - - text = token.content; - pos = 0; - max = text.length; - - /*eslint no-labels:0,block-scoped-var:0*/ - OUTER: - while (pos < max) { - QUOTE_RE.lastIndex = pos; - t = QUOTE_RE.exec(text); - if (!t) { break; } - - canOpen = canClose = true; - pos = t.index + 1; - isSingle = (t[0] === "'"); - - // Find previous character, - // default to space if it's the beginning of the line - // - lastChar = 0x20; - - if (t.index - 1 >= 0) { - lastChar = text.charCodeAt(t.index - 1); - } else { - for (j = i - 1; j >= 0; j--) { - if (tokens[j].type === 'softbreak' || tokens[j].type === 'hardbreak') break; // lastChar defaults to 0x20 - if (tokens[j].type !== 'text') continue; - - lastChar = tokens[j].content.charCodeAt(tokens[j].content.length - 1); - break; - } - } - - // Find next character, - // default to space if it's the end of the line - // - nextChar = 0x20; - - if (pos < max) { - nextChar = text.charCodeAt(pos); - } else { - for (j = i + 1; j < tokens.length; j++) { - if (tokens[j].type === 'softbreak' || tokens[j].type === 'hardbreak') break; // nextChar defaults to 0x20 - if (tokens[j].type !== 'text') continue; - - nextChar = tokens[j].content.charCodeAt(0); - break; - } - } - - isLastPunctChar = isMdAsciiPunct(lastChar) || isPunctChar(String.fromCharCode(lastChar)); - isNextPunctChar = isMdAsciiPunct(nextChar) || isPunctChar(String.fromCharCode(nextChar)); - - isLastWhiteSpace = isWhiteSpace(lastChar); - isNextWhiteSpace = isWhiteSpace(nextChar); - - if (isNextWhiteSpace) { - canOpen = false; - } else if (isNextPunctChar) { - if (!(isLastWhiteSpace || isLastPunctChar)) { - canOpen = false; - } - } - - if (isLastWhiteSpace) { - canClose = false; - } else if 
(isLastPunctChar) { - if (!(isNextWhiteSpace || isNextPunctChar)) { - canClose = false; - } - } - - if (nextChar === 0x22 /* " */ && t[0] === '"') { - if (lastChar >= 0x30 /* 0 */ && lastChar <= 0x39 /* 9 */) { - // special case: 1"" - count first quote as an inch - canClose = canOpen = false; - } - } - - if (canOpen && canClose) { - // Replace quotes in the middle of punctuation sequence, but not - // in the middle of the words, i.e.: - // - // 1. foo " bar " baz - not replaced - // 2. foo-"-bar-"-baz - replaced - // 3. foo"bar"baz - not replaced - // - canOpen = isLastPunctChar; - canClose = isNextPunctChar; - } - - if (!canOpen && !canClose) { - // middle of word - if (isSingle) { - token.content = replaceAt(token.content, t.index, APOSTROPHE); - } - continue; - } - - if (canClose) { - // this could be a closing quote, rewind the stack to get a match - for (j = stack.length - 1; j >= 0; j--) { - item = stack[j]; - if (stack[j].level < thisLevel) { break; } - if (item.single === isSingle && stack[j].level === thisLevel) { - item = stack[j]; - - if (isSingle) { - openQuote = state.md.options.quotes[2]; - closeQuote = state.md.options.quotes[3]; - } else { - openQuote = state.md.options.quotes[0]; - closeQuote = state.md.options.quotes[1]; - } - - // replace token.content *before* tokens[item.token].content, - // because, if they are pointing at the same token, replaceAt - // could mess up indices when quote length != 1 - token.content = replaceAt(token.content, t.index, closeQuote); - tokens[item.token].content = replaceAt( - tokens[item.token].content, item.pos, openQuote); - - pos += closeQuote.length - 1; - if (item.token === i) { pos += openQuote.length - 1; } - - text = token.content; - max = text.length; - - stack.length = j; - continue OUTER; - } - } - } - - if (canOpen) { - stack.push({ - token: i, - pos: t.index, - single: isSingle, - level: thisLevel - }); - } else if (canClose && isSingle) { - token.content = replaceAt(token.content, t.index, 
APOSTROPHE);
-      }
-    }
-  }
-}
-
-
-module.exports = function smartquotes(state) {
-  /*eslint max-depth:0*/
-  var blkIdx;
-
-  if (!state.md.options.typographer) { return; }
-
-  for (blkIdx = state.tokens.length - 1; blkIdx >= 0; blkIdx--) {
-
-    if (state.tokens[blkIdx].type !== 'inline' ||
-        !QUOTE_TEST_RE.test(state.tokens[blkIdx].content)) {
-      continue;
-    }
-
-    process_inlines(state.tokens[blkIdx].children, state);
-  }
-};
diff --git a/markdown_it/rules_core/smartquotes.py b/markdown_it/rules_core/smartquotes.py
new file mode 100644
index 00000000..cceb1778
--- /dev/null
+++ b/markdown_it/rules_core/smartquotes.py
@@ -0,0 +1,209 @@
+"""Convert straight quotation marks to typographic ones
+"""
+import re
+from typing import List
+
+from .state_core import StateCore
+from ..common.utils import charCodeAt
+from ..common.utils import isWhiteSpace, isPunctChar, isMdAsciiPunct
+from ..token import Token
+
+
+QUOTE_TEST_RE = re.compile(r"['\"]")
+QUOTE_RE = re.compile(r"['\"]")
+APOSTROPHE = "\u2019"  # ’
+
+
+def replaceAt(string: str, index: int, ch: str) -> str:
+    """Return ``string`` with the character at ``index`` replaced by ``ch``.
+
+    ``ch`` may be longer than one character, so the result can grow.
+    """
+    # When the index is negative, the behavior is different from the js version.
+    # But basically, the index will not be negative.
+    assert index >= 0
+    return string[:index] + ch + string[index + 1 :]
+
+
+def process_inlines(tokens: List[Token], state: StateCore) -> None:
+    """Replace straight quotes in the children of one inline token, in place.
+
+    Opening quotes that are still unmatched are kept on a stack of
+    ``{"token", "pos", "single", "level"}`` entries until a closing quote of
+    the same kind is found at the same nesting level.
+
+    :param tokens: children of an ``inline`` token
+    :param state: core state; ``state.md.options.quotes`` supplies the
+        replacements (double open/close at 0/1, single open/close at 2/3)
+    """
+    stack = []
+
+    for i, token in enumerate(tokens):
+        thisLevel = token.level
+
+        # Drop pending opening quotes that belong to a deeper nesting level.
+        j = len(stack) - 1
+        while j >= 0 and stack[j]["level"] > thisLevel:
+            j -= 1
+        stack = stack[: j + 1]
+
+        if token.type != "text":
+            continue
+
+        text = token.content
+        pos = 0
+        maximum = len(text)
+
+        while pos < maximum:
+            # Search from ``pos`` so that match offsets are absolute indices.
+            t = QUOTE_RE.search(text, pos)
+            if not t:
+                break
+
+            quotePos = t.start(0)
+            canOpen = canClose = True
+            pos = quotePos + 1
+            isSingle = t.group(0) == "'"
+
+            # Find previous character,
+            # default to space if it's the beginning of the line
+            lastChar = 0x20
+
+            if quotePos - 1 >= 0:
+                lastChar = charCodeAt(text, quotePos - 1)
+            else:
+                for j in range(i)[::-1]:
+                    # lastChar defaults to 0x20
+                    if tokens[j].type == "softbreak" or tokens[j].type == "hardbreak":
+                        break
+                    # should skip all tokens except 'text', 'html_inline' or 'code_inline'
+                    if not tokens[j].content:
+                        continue
+
+                    lastChar = charCodeAt(tokens[j].content, len(tokens[j].content) - 1)
+                    break
+
+            # Find next character,
+            # default to space if it's the end of the line
+            nextChar = 0x20
+
+            if pos < maximum:
+                nextChar = charCodeAt(text, pos)
+            else:
+                for j in range(i + 1, len(tokens)):
+                    # nextChar defaults to 0x20
+                    if tokens[j].type == "softbreak" or tokens[j].type == "hardbreak":
+                        break
+                    # should skip all tokens except 'text', 'html_inline' or 'code_inline'
+                    if not tokens[j].content:
+                        continue
+
+                    nextChar = charCodeAt(tokens[j].content, 0)
+                    break
+
+            # isMdAsciiPunct receives the code point, as in the js version;
+            # only isPunctChar takes the character itself.
+            # NOTE(review): confirm common.utils.isMdAsciiPunct expects an int.
+            isLastPunctChar = isMdAsciiPunct(lastChar) or isPunctChar(chr(lastChar))
+            isNextPunctChar = isMdAsciiPunct(nextChar) or isPunctChar(chr(nextChar))
+
+            isLastWhiteSpace = isWhiteSpace(lastChar)
+            isNextWhiteSpace = isWhiteSpace(nextChar)
+
+            if isNextWhiteSpace:
+                canOpen = False
+            elif isNextPunctChar:
+                if not (isLastWhiteSpace or isLastPunctChar):
+                    canOpen = False
+
+            if isLastWhiteSpace:
+                canClose = False
+            elif isLastPunctChar:
+                if not (isNextWhiteSpace or isNextPunctChar):
+                    canClose = False
+
+            if nextChar == 0x22 and t.group(0) == '"':  # 0x22: "
+                if lastChar >= 0x30 and lastChar <= 0x39:  # 0x30: 0, 0x39: 9
+                    # special case: 1"" - count first quote as an inch
+                    canClose = canOpen = False
+
+            if canOpen and canClose:
+                # Replace quotes in the middle of punctuation sequence, but not
+                # in the middle of the words, i.e.:
+                #
+                # 1. foo " bar " baz - not replaced
+                # 2. foo-"-bar-"-baz - replaced
+                # 3. foo"bar"baz     - not replaced
+                canOpen = isLastPunctChar
+                canClose = isNextPunctChar
+
+            if not canOpen and not canClose:
+                # middle of word
+                if isSingle:
+                    token.content = replaceAt(token.content, quotePos, APOSTROPHE)
+                continue
+
+            if canClose:
+                # this could be a closing quote, rewind the stack to get a match
+                matched = False
+                for j in range(len(stack))[::-1]:
+                    item = stack[j]
+                    if item["level"] < thisLevel:
+                        break
+                    if item["single"] == isSingle and item["level"] == thisLevel:
+                        if isSingle:
+                            openQuote = state.md.options.quotes[2]
+                            closeQuote = state.md.options.quotes[3]
+                        else:
+                            openQuote = state.md.options.quotes[0]
+                            closeQuote = state.md.options.quotes[1]
+
+                        # replace token.content *before* tokens[item.token].content,
+                        # because, if they are pointing at the same token, replaceAt
+                        # could mess up indices when quote length != 1
+                        token.content = replaceAt(token.content, quotePos, closeQuote)
+                        tokens[item["token"]].content = replaceAt(
+                            tokens[item["token"]].content, item["pos"], openQuote
+                        )
+
+                        pos += len(closeQuote) - 1
+                        if item["token"] == i:
+                            pos += len(openQuote) - 1
+
+                        text = token.content
+                        maximum = len(text)
+
+                        stack = stack[:j]
+                        matched = True
+                        break
+                if matched:
+                    # equivalent of `continue OUTER` in the js version
+                    continue
+
+            if canOpen:
+                stack.append(
+                    {
+                        "token": i,
+                        "pos": quotePos,
+                        "single": isSingle,
+                        "level": thisLevel,
+                    }
+                )
+            elif canClose and isSingle:
+                token.content = replaceAt(token.content, quotePos, APOSTROPHE)
+
+
+def smartquotes(state: StateCore) -> None:
+    """Core rule: convert straight quotes to typographic ones.
+
+    Does nothing unless the ``typographer`` option is enabled.  Every
+    ``inline`` token whose content contains a quote is processed in place.
+    """
+    if not state.md.options.typographer:
+        return
+
+    for token in state.tokens:
+        if token.type != "inline" or not QUOTE_TEST_RE.search(token.content):
+            continue
+
+        process_inlines(token.children, state)
diff --git a/tests/test_api/test_main.py b/tests/test_api/test_main.py
index 3eebdf60..0056ebc7 100644
--- a/tests/test_api/test_main.py
+++ b/tests/test_api/test_main.py
@@ -7,7 +7,7 @@ def test_get_rules():
     md = MarkdownIt("zero")
     # print(md.get_all_rules())
     assert md.get_all_rules() == {
-        "core": ["normalize", "block", "inline", "replacements"],
+        "core": ["normalize", "block", "inline", "replacements", "smartquotes"],
         "block": [
             "table",
             "code",
diff --git a/tests/test_port/fixtures/smartquotes.md b/tests/test_port/fixtures/smartquotes.md
index b7a56312..70378b8e 100644
--- a/tests/test_port/fixtures/smartquotes.md
+++ b/tests/test_port/fixtures/smartquotes.md
@@ -132,9 +132,9 @@ and "that".
 "this" and\
 "that".
 .
-

“this”
+

“this”
and “that”.

-

“this” and
+

“this” and
“that”.

. @@ -144,3 +144,23 @@ The dog---"'man's' best friend" .

The dog—“‘man’s’ best friend”

. + +Should parse quotes adjacent to code block, #677: +. +"test `code`" + +"`code` test" +. +

“test code

+

code test”

+. + +Should parse quotes adjacent to inline html, #677: +. +"test
" + +"
test" +. +

“test

+


test”

+. \ No newline at end of file diff --git a/tests/test_port/test_fixtures.py b/tests/test_port/test_fixtures.py index 68c09fb7..0475f20c 100644 --- a/tests/test_port/test_fixtures.py +++ b/tests/test_port/test_fixtures.py @@ -8,6 +8,17 @@ FIXTURE_PATH = Path(__file__).parent.joinpath("fixtures") +@pytest.mark.parametrize( + "line,title,input,expected", + read_fixture_file(FIXTURE_PATH.joinpath("smartquotes.md")), +) +def test_smartquotes(line, title, input, expected): + md = MarkdownIt().enable("replacements").enable("smartquotes") + md.options["typographer"] = True + text = md.render(input) + assert text.rstrip() == expected.rstrip() + + @pytest.mark.parametrize( "line,title,input,expected", read_fixture_file(FIXTURE_PATH.joinpath("typographer.md")),