From cf3352c8a224af219cb062cfe467d7da9289284a Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Fri, 28 Feb 2020 18:01:24 +1100 Subject: [PATCH] Add performance benchmark (#88) Add CLI and documentation, benchmarking `myst-parser` against other Python based markdown parsers. --- .vscode/settings.json | 7 +- docs/conf.py | 1 + docs/using/benchmark.md | 1000 +++++++++++++++++++++++++ docs/using/index.md | 1 + docs/using/use_api.md | 18 + docstring.fmt.mustache | 20 + myst_parser/__init__.py | 23 + myst_parser/block_tokens.py | 27 +- myst_parser/cli/__init__.py | 0 myst_parser/cli/benchmark.py | 113 +++ myst_parser/docutils_renderer.py | 15 +- setup.py | 4 +- tests/test_cli.py | 11 + tests/test_renderers/test_docutils.py | 13 +- 14 files changed, 1236 insertions(+), 17 deletions(-) create mode 100644 docs/using/benchmark.md create mode 100644 docstring.fmt.mustache create mode 100644 myst_parser/cli/__init__.py create mode 100644 myst_parser/cli/benchmark.py create mode 100644 tests/test_cli.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 2ee89ab6..db5a5166 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -8,9 +8,12 @@ "**/__pycache__": true, "**/.pytest_cache": true }, - "editor.rulers": [88], + "editor.rulers": [ + 88 + ], "python.formatting.provider": "black", "python.linting.pylintEnabled": false, "python.linting.flake8Enabled": true, "python.linting.enabled": true, -} + "autoDocstring.customTemplatePath": "docstring.fmt.mustache" +} \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index f58a5ea5..3fdc5f7f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -117,6 +117,7 @@ def run_apidoc(app): ("py:class", "mistletoe.block_token.Table"), ("py:class", "mistletoe.block_token.Footnote"), ("py:class", "mistletoe.block_token.Paragraph"), + ("py:class", "mistletoe.block_token.ThematicBreak"), ("py:class", "mistletoe.base_renderer.BaseRenderer"), ("py:class", "mistletoe.html_renderer.HTMLRenderer"), ("py:class", 
"mistletoe.span_token.SpanToken"), diff --git a/docs/using/benchmark.md b/docs/using/benchmark.md new file mode 100644 index 00000000..9cfb8f7c --- /dev/null +++ b/docs/using/benchmark.md @@ -0,0 +1,1000 @@ +Parsing Performance Benchmark +============================= + +The following document describes Markdown syntax, [as written by John Gruber][src]. +It is used to benchmark the parsing speed of the MyST-Parser against +some existing Markdown parsers written in Python: + + [src]: https://daringfireball.net/projects/markdown/syntax + + $ myst-benchmark docs/using/benchmark.md + Test document: benchmark.md + Test iterations: 1000 + Running tests ... + ================= + python-markdown:extra (3.2): 53.78 s + mistune (0.8.4): 15.22 s + commonmark.py (0.9.1): 61.92 s + mistletoe (0.8.0): 43.51 s + myst_parser:html (0.2.0): 52.47 s + myst_parser:docutils (0.2.0): 65.27 s + myst_parser:sphinx (0.2.0): 73.54 s + +As already noted by [mistletoe](https://github.com/miyuchina/mistletoe#performance) +(which this package is built on), although Mistune is the fastest, +this is because it does not strictly follow the CommonMark spec, +which outlines a highly context-sensitive grammar for Markdown. +The simpler approach taken by Mistune means that it cannot handle more +complex parsing cases, such as precedence of different types of tokens, escaping rules, etc. + +The MyST parser is slightly slower than the base mistletoe parser, due to the additional +syntax which it parses. Then the conversion to docutils AST takes some more time, +but is still comparably performant to the core CommonMark.py parser. The sphinx +parse takes some extra time, due to loading the full sphinx environment, +including its roles and directives. 
+ +## Contents + +* [Overview](#overview) + * [Philosophy](#philosophy) + * [Inline HTML](#html) + * [Automatic Escaping for Special Characters](#autoescape) +* [Block Elements](#block) + * [Paragraphs and Line Breaks](#p) + * [Headers](#header) + * [Blockquotes](#blockquote) + * [Lists](#list) + * [Code Blocks](#precode) + * [Horizontal Rules](#hr) +* [Span Elements](#span) + * [Links](#link) + * [Emphasis](#em) + * [Code](#code) + * [Images](#img) +* [Miscellaneous](#misc) + * [Backslash Escapes](#backslash) + * [Automatic Links](#autolink) + +* * * + +

<h2 id="overview">Overview</h2>

+ +

<h3 id="philosophy">Philosophy</h3>

+ +Markdown is intended to be as easy-to-read and easy-to-write as is feasible. + +Readability, however, is emphasized above all else. A Markdown-formatted +document should be publishable as-is, as plain text, without looking +like it's been marked up with tags or formatting instructions. While +Markdown's syntax has been influenced by several existing text-to-HTML +filters -- including [Setext] [1], [atx] [2], [Textile] [3], [reStructuredText] [4], +[Grutatext] [5], and [EtText] [6] -- the single biggest source of +inspiration for Markdown's syntax is the format of plain text email. + + [1]: http://docutils.sourceforge.net/mirror/setext.html + [2]: http://www.aaronsw.com/2002/atx/ + [3]: http://textism.com/tools/textile/ + [4]: http://docutils.sourceforge.net/rst.html + [5]: http://www.triptico.com/software/grutatxt.html + [6]: http://ettext.taint.org/doc/ + +To this end, Markdown's syntax is comprised entirely of punctuation +characters, which punctuation characters have been carefully chosen so +as to look like what they mean. E.g., asterisks around a word actually +look like \*emphasis\*. Markdown lists look like, well, lists. Even +blockquotes look like quoted passages of text, assuming you've ever +used email. + +

<h3 id="html">Inline HTML</h3>

+ +Markdown's syntax is intended for one purpose: to be used as a +format for *writing* for the web. + +Markdown is not a replacement for HTML, or even close to it. Its +syntax is very small, corresponding only to a very small subset of +HTML tags. The idea is *not* to create a syntax that makes it easier +to insert HTML tags. In my opinion, HTML tags are already easy to +insert. The idea for Markdown is to make it easy to read, write, and +edit prose. HTML is a *publishing* format; Markdown is a *writing* +format. Thus, Markdown's formatting syntax only addresses issues that +can be conveyed in plain text. + +For any markup that is not covered by Markdown's syntax, you simply +use HTML itself. There's no need to preface it or delimit it to +indicate that you're switching from Markdown to HTML; you just use +the tags. + +The only restrictions are that block-level HTML elements -- e.g. `
<div>`, +`<table>`, `<pre>`, `<p>`, etc. -- must be separated from surrounding +content by blank lines, and the start and end tags of the block should +not be indented with tabs or spaces. Markdown is smart enough not +to add extra (unwanted) `<p>
` tags around HTML block-level tags. + +For example, to add an HTML table to a Markdown article: + + This is a regular paragraph. + +

<table> + <tr> + <td>Foo</td> + </tr> + </table>
+ + This is another regular paragraph. + +Note that Markdown formatting syntax is not processed within block-level +HTML tags. E.g., you can't use Markdown-style `*emphasis*` inside an +HTML block. + +Span-level HTML tags -- e.g. ``, ``, or `` -- can be +used anywhere in a Markdown paragraph, list item, or header. If you +want, you can even use HTML tags instead of Markdown formatting; e.g. if +you'd prefer to use HTML `` or `` tags instead of Markdown's +link or image syntax, go right ahead. + +Unlike block-level HTML tags, Markdown syntax *is* processed within +span-level tags. + + +

<h3 id="autoescape">Automatic Escaping for Special Characters</h3>

+ +In HTML, there are two characters that demand special treatment: `<` +and `&`. Left angle brackets are used to start tags; ampersands are +used to denote HTML entities. If you want to use them as literal +characters, you must escape them as entities, e.g. `<`, and +`&`. + +Ampersands in particular are bedeviling for web writers. If you want to +write about 'AT&T', you need to write '`AT&T`'. You even need to +escape ampersands within URLs. Thus, if you want to link to: + + http://images.google.com/images?num=30&q=larry+bird + +you need to encode the URL as: + + http://images.google.com/images?num=30&q=larry+bird + +in your anchor tag `href` attribute. Needless to say, this is easy to +forget, and is probably the single most common source of HTML validation +errors in otherwise well-marked-up web sites. + +Markdown allows you to use these characters naturally, taking care of +all the necessary escaping for you. If you use an ampersand as part of +an HTML entity, it remains unchanged; otherwise it will be translated +into `&`. + +So, if you want to include a copyright symbol in your article, you can write: + + © + +and Markdown will leave it alone. But if you write: + + AT&T + +Markdown will translate it to: + + AT&T + +Similarly, because Markdown supports [inline HTML](#html), if you use +angle brackets as delimiters for HTML tags, Markdown will treat them as +such. But if you write: + + 4 < 5 + +Markdown will translate it to: + + 4 < 5 + +However, inside Markdown code spans and blocks, angle brackets and +ampersands are *always* encoded automatically. This makes it easy to use +Markdown to write about HTML code. (As opposed to raw HTML, which is a +terrible format for writing about HTML syntax, because every single `<` +and `&` in your example code needs to be escaped.) + + +* * * + + +

<h2 id="block">Block Elements</h2>

+ + +

<h3 id="p">Paragraphs and Line Breaks</h3>

+ +A paragraph is simply one or more consecutive lines of text, separated +by one or more blank lines. (A blank line is any line that looks like a +blank line -- a line containing nothing but spaces or tabs is considered +blank.) Normal paragraphs should not be indented with spaces or tabs. + +The implication of the "one or more consecutive lines of text" rule is +that Markdown supports "hard-wrapped" text paragraphs. This differs +significantly from most other text-to-HTML formatters (including Movable +Type's "Convert Line Breaks" option) which translate every line break +character in a paragraph into a `
<br />` tag. + +When you *do* want to insert a `<br />` break tag using Markdown, you +end a line with two or more spaces, then type return. + +Yes, this takes a tad more effort to create a `<br />`, but a simplistic +"every line break is a `<br />
`" rule wouldn't work for Markdown. +Markdown's email-style [blockquoting][bq] and multi-paragraph [list items][l] +work best -- and look better -- when you format them with hard breaks. + + [bq]: #blockquote + [l]: #list + + + + + +Markdown supports two styles of headers, [Setext] [1] and [atx] [2]. + +Setext-style headers are "underlined" using equal signs (for first-level +headers) and dashes (for second-level headers). For example: + + This is an H1 + ============= + + This is an H2 + ------------- + +This is an H1 +============= + +This is an H2 +------------- + +Any number of underlining `=`'s or `-`'s will work. + +Atx-style headers use 1-6 hash characters at the start of the line, +corresponding to header levels 1-6. For example: + + # This is an H1 + + ## This is an H2 + + ###### This is an H6 + +Optionally, you may "close" atx-style headers. This is purely +cosmetic -- you can use this if you think it looks better. The +closing hashes don't even need to match the number of hashes +used to open the header. (The number of opening hashes +determines the header level.) : + + # This is an H1 # + + ## This is an H2 ## + + ### This is an H3 ###### + + +

<h3 id="blockquote">Blockquotes</h3>

+ +Markdown uses email-style `>` characters for blockquoting. If you're +familiar with quoting passages of text in an email message, then you +know how to create a blockquote in Markdown. It looks best if you hard +wrap the text and put a `>` before every line: + + > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet, + > consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus. + > Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus. + > + > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse + > id sem consectetuer libero luctus adipiscing. + +* * * + +> This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet, +> consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus. +> Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus. +> +> Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse +> id sem consectetuer libero luctus adipiscing. + +* * * + +Markdown allows you to be lazy and only put the `>` before the first +line of a hard-wrapped paragraph: + + > This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet, + consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus. + Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus. + + > Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse + id sem consectetuer libero luctus adipiscing. + +* * * + +> This is a blockquote with two paragraphs. Lorem ipsum dolor sit amet, +consectetuer adipiscing elit. Aliquam hendrerit mi posuere lectus. +Vestibulum enim wisi, viverra nec, fringilla in, laoreet vitae, risus. + +> Donec sit amet nisl. Aliquam semper ipsum sit amet velit. Suspendisse +id sem consectetuer libero luctus adipiscing. + +* * * + +Blockquotes can be nested (i.e. a blockquote-in-a-blockquote) by +adding additional levels of `>`: + + > This is the first level of quoting. + > + > > This is nested blockquote. + > + > Back to the first level. 
+ +* * * + +> This is the first level of quoting. +> +> > This is nested blockquote. +> +> Back to the first level. + +* * * + +Blockquotes can contain other Markdown elements, including headers, lists, +and code blocks: + + > ## This is a header. + > + > 1. This is the first list item. + > 2. This is the second list item. + > + > Here's some example code: + > + > return shell_exec("echo $input | $markdown_script"); + +Any decent text editor should make email-style quoting easy. For +example, with BBEdit, you can make a selection and choose Increase +Quote Level from the Text menu. + + +

<h3 id="list">Lists</h3>

+ +Markdown supports ordered (numbered) and unordered (bulleted) lists. + +Unordered lists use asterisks, pluses, and hyphens -- interchangeably +-- as list markers: + + * Red + * Green + * Blue + +* * * + +* Red +* Green +* Blue + +* * * + +is equivalent to: + + + Red + + Green + + Blue + +* * * + ++ Red ++ Green ++ Blue + +* * * + +and: + + - Red + - Green + - Blue + +* * * + +- Red +- Green +- Blue + +* * * + +Ordered lists use numbers followed by periods: + + 1. Bird + 2. McHale + 3. Parish + +* * * + +1. Bird +2. McHale +3. Parish + +* * * + +It's important to note that the actual numbers you use to mark the +list have no effect on the HTML output Markdown produces. The HTML +Markdown produces from the above list is: + +
<ol> + <li>Bird</li> + <li>McHale</li> + <li>Parish</li> + </ol>
+ +If you instead wrote the list in Markdown like this: + + 1. Bird + 2. McHale + 3. Parish + +or even: + + 3. Bird + 1. McHale + 8. Parish + +* * * + +1. Bird +2. McHale +3. Parish + +* * * + +you'd get the exact same HTML output. The point is, if you want to, +you can use ordinal numbers in your ordered Markdown lists, so that +the numbers in your source match the numbers in your published HTML. +But if you want to be lazy, you don't have to. + +If you do use lazy list numbering, however, you should still start the +list with the number 1. At some point in the future, Markdown may support +starting ordered lists at an arbitrary number. + +List markers typically start at the left margin, but may be indented by +up to three spaces. List markers must be followed by one or more spaces +or a tab. + +To make lists look nice, you can wrap items with hanging indents: + + * Lorem ipsum dolor sit amet, consectetuer adipiscing elit. + Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi, + viverra nec, fringilla in, laoreet vitae, risus. + * Donec sit amet nisl. Aliquam semper ipsum sit amet velit. + Suspendisse id sem consectetuer libero luctus adipiscing. + +But if you want to be lazy, you don't have to: + + * Lorem ipsum dolor sit amet, consectetuer adipiscing elit. + Aliquam hendrerit mi posuere lectus. Vestibulum enim wisi, + viverra nec, fringilla in, laoreet vitae, risus. + * Donec sit amet nisl. Aliquam semper ipsum sit amet velit. + Suspendisse id sem consectetuer libero luctus adipiscing. + +If list items are separated by blank lines, Markdown will wrap the +items in `

<p>` tags in the HTML output. For example, this input: + + * Bird + * Magic + +will turn into: + +

<ul> + <li>Bird</li> + <li>Magic</li> + </ul>
+ +But this: + + * Bird + + * Magic + +will turn into: + +
<ul> + <li><p>Bird</p></li> + <li><p>Magic</p></li> + </ul>
+ +List items may consist of multiple paragraphs. Each subsequent +paragraph in a list item must be indented by either 4 spaces +or one tab: + + 1. This is a list item with two paragraphs. Lorem ipsum dolor + sit amet, consectetuer adipiscing elit. Aliquam hendrerit + mi posuere lectus. + + Vestibulum enim wisi, viverra nec, fringilla in, laoreet + vitae, risus. Donec sit amet nisl. Aliquam semper ipsum + sit amet velit. + + 2. Suspendisse id sem consectetuer libero luctus adipiscing. + +It looks nice if you indent every line of the subsequent +paragraphs, but here again, Markdown will allow you to be +lazy: + + * This is a list item with two paragraphs. + + This is the second paragraph in the list item. You're + only required to indent the first line. Lorem ipsum dolor + sit amet, consectetuer adipiscing elit. + + * Another item in the same list. + +To put a blockquote within a list item, the blockquote's `>` +delimiters need to be indented: + + * A list item with a blockquote: + + > This is a blockquote + > inside a list item. + +* * * + +* A list item with a blockquote: + + > This is a blockquote + > inside a list item. + +* * * + +To put a code block within a list item, the code block needs +to be indented *twice* -- 8 spaces or two tabs: + + * A list item with a code block: + + + + +It's worth noting that it's possible to trigger an ordered list by +accident, by writing something like this: + + 1. What a great season. + +In other words, a *number-period-space* sequence at the beginning of a +line. To avoid this, you can backslash-escape the period: + + 1986\. What a great season. + + + +

<h3 id="precode">Code Blocks</h3>

+ +Pre-formatted code blocks are used for writing about programming or +markup source code. Rather than forming normal paragraphs, the lines +of a code block are interpreted literally. Markdown wraps a code block +in both `
<pre>` and `<code>` tags.
+
+To produce a code block in Markdown, simply indent every line of the
+block by at least 4 spaces or 1 tab. For example, given this input:
+
+    This is a normal paragraph:
+
+        This is a code block.
+
+Markdown will generate:
+
+    <p>This is a normal paragraph:</p>
+
+    <pre><code>This is a code block.
+    </code></pre>
+ +One level of indentation -- 4 spaces or 1 tab -- is removed from each +line of the code block. For example, this: + + Here is an example of AppleScript: + + tell application "Foo" + beep + end tell + +will turn into: + +

<p>Here is an example of AppleScript:</p>
+
+    <pre><code>tell application "Foo"
+        beep
+    end tell
+    </code></pre>
+ +A code block continues until it reaches a line that is not indented +(or the end of the article). + +Within a code block, ampersands (`&`) and angle brackets (`<` and `>`) +are automatically converted into HTML entities. This makes it very +easy to include example HTML source code using Markdown -- just paste +it and indent it, and Markdown will handle the hassle of encoding the +ampersands and angle brackets. For example, this: + + + +will turn into: + +
<pre><code>&lt;div class="footer"&gt;
+        &amp;copy; 2004 Foo Corporation
+    &lt;/div&gt;
+    </code></pre>
+ +Regular Markdown syntax is not processed within code blocks. E.g., +asterisks are just literal asterisks within a code block. This means +it's also easy to use Markdown to write about Markdown's own syntax. + + + +

<h3 id="hr">Horizontal Rules</h3>

+ +You can produce a horizontal rule tag (`
<hr />`) by placing three or +more hyphens, asterisks, or underscores on a line by themselves. If you +wish, you may use spaces between the hyphens or asterisks. Each of the +following lines will produce a horizontal rule: + + * * * + + *** + + ***** + + - - - + + --------------------------------------- + + +* * * + +

<h2 id="span">Span Elements</h2>

+ +<h3 id="link">Links</h3> + +Markdown supports two styles of links: *inline* and *reference*. + +In both styles, the link text is delimited by [square brackets]. + +To create an inline link, use a set of regular parentheses immediately +after the link text's closing square bracket. Inside the parentheses, +put the URL where you want the link to point, along with an *optional* +title for the link, surrounded in quotes. For example: + + This is [an example](http://example.com/ "Title") inline link. + + [This link](http://example.net/) has no title attribute. + +Will produce: + +

<p>This is <a href="http://example.com/" title="Title"> + an example</a> inline link.</p>

+ +

<p><a href="http://example.net/">This link</a> has no + title attribute.</p>

+ +If you're referring to a local resource on the same server, you can +use relative paths: + + See my [About](/about/) page for details. + +Reference-style links use a second set of square brackets, inside +which you place a label of your choosing to identify the link: + + This is [an example][id] reference-style link. + +You can optionally use a space to separate the sets of brackets: + + This is [an example] [id] reference-style link. + +Then, anywhere in the document, you define your link label like this, +on a line by itself: + + [id]: http://example.com/ "Optional Title Here" + +That is: + +* Square brackets containing the link identifier (optionally + indented from the left margin using up to three spaces); +* followed by a colon; +* followed by one or more spaces (or tabs); +* followed by the URL for the link; +* optionally followed by a title attribute for the link, enclosed + in double or single quotes, or enclosed in parentheses. + +The following three link definitions are equivalent: + + [foo]: http://example.com/ "Optional Title Here" + [foo]: http://example.com/ 'Optional Title Here' + [foo]: http://example.com/ (Optional Title Here) + +**Note:** There is a known bug in Markdown.pl 1.0.1 which prevents +single quotes from being used to delimit link titles. + +The link URL may, optionally, be surrounded by angle brackets: + + [id]: "Optional Title Here" + +You can put the title attribute on the next line and use extra spaces +or tabs for padding, which tends to look better with longer URLs: + + [id]: http://example.com/longish/path/to/resource/here + "Optional Title Here" + +Link definitions are only used for creating links during Markdown +processing, and are stripped from your document in the HTML output. + +Link definition names may consist of letters, numbers, spaces, and +punctuation -- but they are *not* case sensitive. E.g. these two +links: + + [link text][a] + [link text][A] + +are equivalent. 
+ +The *implicit link name* shortcut allows you to omit the name of the +link, in which case the link text itself is used as the name. +Just use an empty set of square brackets -- e.g., to link the word +"Google" to the google.com web site, you could simply write: + + [Google][] + +And then define the link: + + [Google]: http://google.com/ + +Because link names may contain spaces, this shortcut even works for +multiple words in the link text: + + Visit [Daring Fireball][] for more information. + +And then define the link: + + [Daring Fireball]: http://daringfireball.net/ + +Link definitions can be placed anywhere in your Markdown document. I +tend to put them immediately after each paragraph in which they're +used, but if you want, you can put them all at the end of your +document, sort of like footnotes. + +Here's an example of reference links in action: + + I get 10 times more traffic from [Google] [1] than from + [Yahoo] [2] or [MSN] [3]. + + [1]: http://google.com/ "Google" + [2]: http://search.yahoo.com/ "Yahoo Search" + [3]: http://search.msn.com/ "MSN Search" + +Using the implicit link name shortcut, you could instead write: + + I get 10 times more traffic from [Google][] than from + [Yahoo][] or [MSN][]. + + [google]: http://google.com/ "Google" + [yahoo]: http://search.yahoo.com/ "Yahoo Search" + [msn]: http://search.msn.com/ "MSN Search" + +Both of the above examples will produce the following HTML output: + +

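<p>I get 10 times more traffic from <a href="http://google.com/" title="Google">Google</a> than from + <a href="http://search.yahoo.com/" title="Yahoo Search">Yahoo</a> + or <a href="http://search.msn.com/" title="MSN Search">MSN</a>.</p>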

+ +For comparison, here is the same paragraph written using +Markdown's inline link style: + + I get 10 times more traffic from [Google](http://google.com/ "Google") + than from [Yahoo](http://search.yahoo.com/ "Yahoo Search") or + [MSN](http://search.msn.com/ "MSN Search"). + +The point of reference-style links is not that they're easier to +write. The point is that with reference-style links, your document +source is vastly more readable. Compare the above examples: using +reference-style links, the paragraph itself is only 81 characters +long; with inline-style links, it's 176 characters; and as raw HTML, +it's 234 characters. In the raw HTML, there's more markup than there +is text. + +With Markdown's reference-style links, a source document much more +closely resembles the final output, as rendered in a browser. By +allowing you to move the markup-related metadata out of the paragraph, +you can add links without interrupting the narrative flow of your +prose. + + +

<h3 id="em">Emphasis</h3>

+ +Markdown treats asterisks (`*`) and underscores (`_`) as indicators of +emphasis. Text wrapped with one `*` or `_` will be wrapped with an +HTML `` tag; double `*`'s or `_`'s will be wrapped with an HTML +`` tag. E.g., this input: + + *single asterisks* + + _single underscores_ + + **double asterisks** + + __double underscores__ + +will produce: + + single asterisks + + single underscores + + double asterisks + + double underscores + +You can use whichever style you prefer; the lone restriction is that +the same character must be used to open and close an emphasis span. + +Emphasis can be used in the middle of a word: + + un*frigging*believable + +But if you surround an `*` or `_` with spaces, it'll be treated as a +literal asterisk or underscore. + +To produce a literal asterisk or underscore at a position where it +would otherwise be used as an emphasis delimiter, you can backslash +escape it: + + \*this text is surrounded by literal asterisks\* + + + +

<h3 id="code">Code</h3>

+ +To indicate a span of code, wrap it with backtick quotes (`` ` ``). +Unlike a pre-formatted code block, a code span indicates code within a +normal paragraph. For example: + + Use the `printf()` function. + +will produce: + +

<p>Use the <code>printf()</code> function.</p>

+ +To include a literal backtick character within a code span, you can use +multiple backticks as the opening and closing delimiters: + + ``There is a literal backtick (`) here.`` + +which will produce this: + +

<p><code>There is a literal backtick (`) here.</code></p>

+ +The backtick delimiters surrounding a code span may include spaces -- +one after the opening, one before the closing. This allows you to place +literal backtick characters at the beginning or end of a code span: + + A single backtick in a code span: `` ` `` + + A backtick-delimited string in a code span: `` `foo` `` + +will produce: + +

<p>A single backtick in a code span: <code>`</code></p>

+ +

<p>A backtick-delimited string in a code span: <code>`foo`</code></p>

+ +With a code span, ampersands and angle brackets are encoded as HTML +entities automatically, which makes it easy to include example HTML +tags. Markdown will turn this: + + Please don't use any `` tags. + +into: + +

<p>Please don't use any <code>&lt;blink&gt;</code> tags.</p>

+ +You can write this: + + `—` is the decimal-encoded equivalent of `—`. + +to produce: + +

<p><code>&amp;#8212;</code> is the decimal-encoded + equivalent of <code>&amp;mdash;</code>.</p>

+ + + +

<h3 id="img">Images</h3>

+ +Admittedly, it's fairly difficult to devise a "natural" syntax for +placing images into a plain text document format. + +Markdown uses an image syntax that is intended to resemble the syntax +for links, allowing for two styles: *inline* and *reference*. + +Inline image syntax looks like this: + + ![Alt text](/path/to/img.jpg) + + ![Alt text](/path/to/img.jpg "Optional title") + +That is: + +* An exclamation mark: `!`; +* followed by a set of square brackets, containing the `alt` + attribute text for the image; +* followed by a set of parentheses, containing the URL or path to + the image, and an optional `title` attribute enclosed in double + or single quotes. + +Reference-style image syntax looks like this: + + ![Alt text][id] + +Where "id" is the name of a defined image reference. Image references +are defined using syntax identical to link references: + + [id]: url/to/image "Optional title attribute" + +As of this writing, Markdown has no syntax for specifying the +dimensions of an image; if this is important to you, you can simply +use regular HTML `` tags. + + +* * * + + +

<h2 id="misc">Miscellaneous</h2>

+ + + +Markdown supports a shortcut style for creating "automatic" links for URLs and email addresses: simply surround the URL or email address with angle brackets. What this means is that if you want to show the actual text of a URL or email address, and also have it be a clickable link, you can do this: + + + +Markdown will turn this into: + + http://example.com/ + +Automatic links for email addresses work similarly, except that +Markdown will also perform a bit of randomized decimal and hex +entity-encoding to help obscure your address from address-harvesting +spambots. For example, Markdown will turn this: + + + +into something like this: + + address@exa + mple.com + +which will render in a browser as a clickable link to "address@example.com". + +(This sort of entity-encoding trick will indeed fool many, if not +most, address-harvesting bots, but it definitely won't fool all of +them. It's better than nothing, but an address published in this way +will probably eventually start receiving spam.) + + + +

<h3 id="backslash">Backslash Escapes</h3>

+ +Markdown allows you to use backslash escapes to generate literal +characters which would otherwise have special meaning in Markdown's +formatting syntax. For example, if you wanted to surround a word +with literal asterisks (instead of an HTML `` tag), you can use +backslashes before the asterisks, like this: + + \*literal asterisks\* + +Markdown provides backslash escapes for the following characters: + + \ backslash + ` backtick + * asterisk + _ underscore + {} curly braces + [] square brackets + () parentheses + # hash mark + + plus sign + - minus sign (hyphen) + . dot + ! exclamation mark diff --git a/docs/using/index.md b/docs/using/index.md index e393b02b..d8ce3964 100644 --- a/docs/using/index.md +++ b/docs/using/index.md @@ -7,5 +7,6 @@ MyST documents. install.md sphinx.md syntax.md +benchmark.md use_api.md ``` diff --git a/docs/using/use_api.md b/docs/using/use_api.md index c05488b3..1c87cb87 100644 --- a/docs/using/use_api.md +++ b/docs/using/use_api.md @@ -6,6 +6,24 @@ MyST-Parser may be used as an API *via* the `myst_parser` package. The raw text is first parsed to syntax 'tokens', then these are converted to other formats using 'renderers'. +The simplest way to parse text is using: + +```python +from myst_parser import parse_text +parse_text("some *text*", "html") +``` + +```html +'

some text

\n' +``` + +The output type can be one of: + +- `dict` (a.k.a ast) +- `html` +- `docutils` +- `sphinx` + ## Convert Text to Tokens To convert some text to tokens: diff --git a/docstring.fmt.mustache b/docstring.fmt.mustache new file mode 100644 index 00000000..717a4572 --- /dev/null +++ b/docstring.fmt.mustache @@ -0,0 +1,20 @@ +{{! Sphinx Docstring Template }} +{{summaryPlaceholder}} + +{{extendedSummaryPlaceholder}} + +{{#args}} +:param {{var}}: {{descriptionPlaceholder}} +{{/args}} +{{#kwargs}} +:param {{var}}: {{descriptionPlaceholder}} +{{/kwargs}} +{{#exceptions}} +:raises {{type}}: {{descriptionPlaceholder}} +{{/exceptions}} +{{#returns}} +:return: {{descriptionPlaceholder}} +{{/returns}} +{{#yields}} +:yield: {{descriptionPlaceholder}} +{{/yields}} diff --git a/myst_parser/__init__.py b/myst_parser/__init__.py index e26a853d..d5c142b6 100644 --- a/myst_parser/__init__.py +++ b/myst_parser/__init__.py @@ -17,6 +17,29 @@ def render_tokens(root_token, renderer, **kwargs): return renderer.render(root_token) +def parse_text(text: str, output_type: str, **kwargs): + """Convert MyST text to another format. 
+ + :param text: the text to convert + :param output_type: one of 'dict', 'html', 'docutils', 'sphinx' + :param kwargs: parsed to the render initiatiation + """ + if output_type == "dict": + from myst_parser.ast_renderer import AstRenderer as renderer_cls + elif output_type == "html": + from myst_parser.html_renderer import HTMLRenderer as renderer_cls + elif output_type == "docutils": + from myst_parser.docutils_renderer import DocutilsRenderer as renderer_cls + elif output_type == "sphinx": + from myst_parser.docutils_renderer import SphinxRenderer as renderer_cls + else: + raise ValueError("output_type not recognised: {}".format(output_type)) + from myst_parser.block_tokens import Document + + with renderer_cls(**kwargs) as renderer: + return renderer.render(Document(text)) + + def setup(app): """Initialize Sphinx extension.""" from myst_parser.sphinx_parser import MystParser diff --git a/myst_parser/block_tokens.py b/myst_parser/block_tokens.py index c417b05b..357b3c36 100644 --- a/myst_parser/block_tokens.py +++ b/myst_parser/block_tokens.py @@ -3,13 +3,7 @@ from mistletoe import block_token, span_token import mistletoe.block_tokenizer as tokenizer -from mistletoe.block_token import ( # noqa: F401 - tokenize, - HTMLBlock, - ThematicBreak, - Footnote, - TableRow, -) +from mistletoe.block_token import tokenize, HTMLBlock, Footnote, TableRow # noqa: F401 """ Tokens to be included in the parsing process, in the order specified. @@ -147,6 +141,25 @@ def __repr__(self): return "MyST.{}(range={})".format(self.__class__.__name__, self.range) +class ThematicBreak(block_token.ThematicBreak): + """ + Thematic break token (a.k.a. horizontal rule.) 
+ """ + + def __init__(self, result): + line, lineno = result + self.raw = line.splitlines()[0] + self.range = (lineno, lineno) + + @classmethod + def read(cls, lines): + line = next(lines) + return line, lines.lineno + + def __repr__(self): + return "MyST.{}(range={})".format(self.__class__.__name__, self.range) + + class BlockBreak(block_token.BlockToken): """Block break token ``+++``. diff --git a/myst_parser/cli/__init__.py b/myst_parser/cli/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/myst_parser/cli/benchmark.py b/myst_parser/cli/benchmark.py new file mode 100644 index 00000000..9603166b --- /dev/null +++ b/myst_parser/cli/benchmark.py @@ -0,0 +1,113 @@ +import argparse +from importlib import import_module +import os +import re +from time import perf_counter + +ALL_PACKAGES = ( + "python-markdown:extra", + "mistune", + "commonmark.py", + "mistletoe", + "myst_parser:html", + "myst_parser:docutils", + "myst_parser:sphinx", +) + + +def benchmark(package_name, version=None): + def decorator(func): + def inner(text, num_parses): + try: + package = import_module(package_name) + print("(" + (version or package.__version__) + ")", end=": ") + except ImportError: + return "not available." 
+ start = perf_counter() + for i in range(num_parses): + func(package, text) + end = perf_counter() + + return end - start + + return inner + + return decorator + + +@benchmark("markdown") +def run_python_markdown_extra(package, text): + return package.markdown(text, extensions=["extra"]) + + +@benchmark("mistune") +def run_mistune(package, text): + return package.markdown(text) + + +@benchmark("commonmark", "0.9.1") +def run_commonmark_py(package, text): + return package.commonmark(text) + + +@benchmark("mistletoe") +def run_mistletoe(package, text): + return package.markdown(text) + + +@benchmark("myst_parser") +def run_myst_parser_html(package, text): + package.parse_text(text, "html") + + +@benchmark("myst_parser") +def run_myst_parser_docutils(package, text): + package.parse_text(text, "docutils", config={"ignore_missing_refs": True}) + + +@benchmark("myst_parser") +def run_myst_parser_sphinx(package, text): + package.parse_text(text, "sphinx", load_sphinx_env=True) + + +def run_all(package_names, text, num_parses): + prompt = "Running {} test(s) ...".format(len(package_names)) + print(prompt) + print("=" * len(prompt)) + for package_name in package_names: + print(package_name, end=" ") + func_name = re.sub(r"[\.\-\:]", "_", package_name.lower()) + print( + "{:.2f} s".format(globals()["run_{}".format(func_name)](text, num_parses)) + ) + return True + + +def main(args=None): + parser = argparse.ArgumentParser(description="Run benchmark test.") + parser.add_argument("path", type=str, help="the path to the file to parse") + parser.add_argument( + "-n", + "--num-parses", + metavar="NPARSES", + default=1000, + type=int, + help="The number of parse iterations (default: 1000)", + ) + parser.add_argument( + "-p", + "--package", + action="append", + default=[], + help="The package(s) to run (use -p multiple times).", + choices=ALL_PACKAGES, + metavar="PACKAGE_NAME", + ) + args = parser.parse_args(args) + + assert os.path.exists(args.path), "path does not exist" + 
print("Test document: {}".format(os.path.basename(args.path))) + print("Test iterations: {}".format(args.num_parses)) + with open(args.path, "r") as handle: + text = handle.read() + return run_all(args.package or ALL_PACKAGES, text, args.num_parses) diff --git a/myst_parser/docutils_renderer.py b/myst_parser/docutils_renderer.py index c015a237..190e1bc8 100644 --- a/myst_parser/docutils_renderer.py +++ b/myst_parser/docutils_renderer.py @@ -186,7 +186,9 @@ def render_strikethrough(self, token): raise NotImplementedError def render_thematic_break(self, token): - self.current_node.append(nodes.transition()) + node = nodes.transition() + self.add_line_and_source_path(node, token) + self.current_node.append(node) def render_block_break(self, token): block_break = nodes.comment(token.content, token.content) @@ -284,11 +286,14 @@ def render_heading(self, token): self.document.note_implicit_target(section, section) self.current_node = section - def handle_cross_reference(self, token, destination, ref_node): + def handle_cross_reference(self, token, destination): # TODO use the docutils error reporting mechanisms, rather than raising - raise NotImplementedError( - "reference not found in current document: {}".format(destination) - ) + if not self.config.get("ignore_missing_refs", False): + raise NotImplementedError( + "reference not found in current document: {}\n{}".format( + destination, token + ) + ) def render_link(self, token): ref_node = nodes.reference() diff --git a/setup.py b/setup.py index 8e527bb4..4e412994 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,9 @@ author_email="chrisj_sewell@hotmail.com", license="MIT", packages=find_packages(), - entry_points={"console_scripts": []}, + entry_points={ + "console_scripts": ["myst-benchmark = myst_parser.cli.benchmark:main"] + }, classifiers=[ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 00000000..4b069ca9 --- 
/dev/null +++ b/tests/test_cli.py @@ -0,0 +1,11 @@ +import pathlib +import tempfile + +from myst_parser.cli import benchmark + + +def test_benchmark(): + with tempfile.TemporaryDirectory() as tempdir: + path = pathlib.Path(tempdir).joinpath("test.md") + path.write_text("a b c") + assert benchmark.main(["-n", "1", "-p", "myst_parser:html", str(path)]) diff --git a/tests/test_renderers/test_docutils.py b/tests/test_renderers/test_docutils.py index 31963b00..60698363 100644 --- a/tests/test_renderers/test_docutils.py +++ b/tests/test_renderers/test_docutils.py @@ -4,7 +4,7 @@ from mistletoe.block_token import tokenize from mistletoe.span_token import tokenize_inner -from myst_parser import text_to_tokens, render_tokens +from myst_parser import text_to_tokens, render_tokens, parse_text from myst_parser.block_tokens import Document from myst_parser.docutils_renderer import SphinxRenderer @@ -21,7 +21,7 @@ def render_token( render_func(mock_token) -def test_render_tokens(): +def test_text_to_tokens(): root = text_to_tokens("abc") document = render_tokens( root, @@ -34,6 +34,15 @@ def test_render_tokens(): ) +def test_parse_text(): + document = parse_text( + "abc", "sphinx", load_sphinx_env=True, sphinx_conf={"project": "MyST Parser"} + ) + assert document.pformat() == ( + '\n \n abc\n' + ) + + def test_strong(renderer_mock): render_token(renderer_mock, "Strong") assert renderer_mock.document.pformat() == dedent(