Refactor AutolinkExtensionSyntax (#471)

chenzhiguang · web-flow · commit 49eefd211e78 · 2022-10-31T09:45:58.000-07:00
* Refactor AutolinkExtensionSyntax Fix https://github.com/dart-lang/markdown/issues/470 * Improve readability * optimise a bit * Fix some requests from review * Update gfm_stats.txt * An optimisation.
diff --git a/lib/src/inline_syntaxes/autolink_extension_syntax.dart b/lib/src/inline_syntaxes/autolink_extension_syntax.dart
@@ -3,134 +3,128 @@
 // BSD-style license that can be found in the LICENSE file.
 
 import '../ast.dart';
+import '../charcode.dart';
 import '../inline_parser.dart';
 import '../util.dart';
 import 'inline_syntax.dart';
 
-/// Matches autolinks like `http://foo.com`.
+/// Matches autolinks like `http://foo.com` and `foo@bar.com`.
 class AutolinkExtensionSyntax extends InlineSyntax {
-  /// Broken up parts of the autolink regex for reusability and readability
-
-  // Autolinks can only come at the beginning of a line, after whitespace, or
-  // any of the delimiting characters *, _, ~, and (.
-  static const start = r'(?:^|[\s*_~(>])';
-
-  // An extended url autolink will be recognized when one of the schemes
-  // http://, https://, or ftp://, followed by a valid domain
-  static const scheme = r'(?:(?:https?|ftp):\/\/|www\.)';
-
-  // A valid domain consists of alphanumeric characters, underscores (_),
-  // hyphens (-) and periods (.). There must be at least one period, and no
-  // underscores may be present in the last two segments of the domain.
-  static const domainPart = r'\w\-';
-  static const domain = '[$domainPart][$domainPart.]+';
-
-  // A valid domain consists of alphanumeric characters, underscores (_),
-  // hyphens (-) and periods (.).
-  static const path = r'[^\s<]*';
-
-  // Trailing punctuation (specifically, ?, !, ., ,, :, *, _, and ~) will not
-  // be considered part of the autolink
-  static const truncatingPunctuationPositive = '[?!.,:*_~]';
-
-  static final regExpTrailingPunc = RegExp('$truncatingPunctuationPositive*\$');
-  static final regExpEndsWithColon = RegExp(r'\&[a-zA-Z0-9]+;$');
-  static final regExpWhiteSpace = RegExp(r'\s');
-
-  AutolinkExtensionSyntax() : super('$start(($scheme)($domain)($path))');
+  static const _linkPattern =
+      // Autolinks can only come at the beginning of a line, after whitespace,
+      // or any of the delimiting characters *, _, ~, and (.
+      r'(?<=^|[\s*_~(>])'
+
+      // An extended url autolink will be recognised when one of the schemes
+      // http://, https:// or ftp://, or the prefix www., precedes a valid
+      // domain. See https://github.github.com/gfm/#extended-url-autolink.
+      r'(?:(?:https?|ftp):\/\/|www\.)'
+
+      // A valid domain consists of segments of alphanumeric characters,
+      // underscores (_) and hyphens (-) separated by periods (.). There must
+      // be at least one period, and no underscores may be present in the last
+      // two segments of the domain. See
+      // https://github.github.com/gfm/#valid-domain.
+      r'(?:[-_a-z0-9]+\.)*(?:[-a-z0-9]+\.[-a-z0-9]+)'
+
+      // After a valid domain, zero or more non-space non-< characters may
+      // follow.
+      r'[^\s<]*'
+
+      // Trailing punctuation (specifically, ?, !, ., ,, :, *, _, and ~) will
+      // not be considered part of the autolink, though they may be included in
+      // the interior of the link. See
+      // https://github.github.com/gfm/#extended-autolink-path-validation.
+      '(?<![?!.,:*_~])';
+
+  // An extended email autolink, see
+  // https://github.github.com/gfm/#extended-email-autolink.
+  static const _emailPattern =
+      r'[-_.+a-z0-9]+@(?:[-_a-z0-9]+\.)+[-_a-z0-9]*[a-z0-9](?![-_])';
+
+  AutolinkExtensionSyntax()
+      : super(
+          '($_linkPattern)|($_emailPattern)',
+          caseSensitive: false,
+        );
 
   @override
   bool tryMatch(InlineParser parser, [int? startMatchPos]) {
-    return super.tryMatch(parser, parser.pos > 0 ? parser.pos - 1 : 0);
+    startMatchPos ??= parser.pos;
+    final startMatch = pattern.matchAsPrefix(parser.source, startMatchPos);
+    if (startMatch == null) {
+      return false;
+    }
+    parser.writeText();
+    return onMatch(parser, startMatch);
   }
 
   @override
   bool onMatch(InlineParser parser, Match match) {
-    var url = match[1]!;
-    var href = url;
-    var matchLength = url.length;
-
-    if (url[0] == '>' || url.startsWith(regExpWhiteSpace)) {
-      url = url.substring(1, url.length - 1);
-      href = href.substring(1, href.length - 1);
-      parser.pos++;
-      matchLength--;
-    }
+    int consumeLength;
 
-    // Prevent accidental standard autolink matches
-    if (url.endsWith('>') && parser.source[parser.pos - 1] == '<') {
-      return false;
+    final isEmailLink = match[2] != null;
+    if (isEmailLink) {
+      consumeLength = match.match.length;
+    } else {
+      consumeLength = _getConsumeLength(match.match);
     }
 
-    // When an autolink ends in ), we scan the entire autolink for the total
-    // number of parentheses. If there is a greater number of closing
-    // parentheses than opening ones, we don’t consider the last character
-    // part of the autolink, in order to facilitate including an autolink
-    // inside a parenthesis:
-    // https://github.github.com/gfm/#example-600
-    if (url.endsWith(')')) {
-      final opening = _countChars(url, '(');
-      final closing = _countChars(url, ')');
-
-      if (closing > opening) {
-        url = url.substring(0, url.length - 1);
-        href = href.substring(0, href.length - 1);
-        matchLength--;
-      }
-    }
+    var text = match.match.substring(0, consumeLength);
+    text = parser.encodeHtml ? escapeHtml(text) : text;
 
-    // Trailing punctuation (specifically, ?, !, ., ,, :, *, _, and ~) will
-    // not be considered part of the autolink, though they may be included
-    // in the interior of the link:
-    // https://github.github.com/gfm/#example-599
-    final trailingPunc = regExpTrailingPunc.firstMatch(url);
-    if (trailingPunc != null) {
-      final trailingLength = trailingPunc.match.length;
-      url = url.substring(0, url.length - trailingLength);
-      href = href.substring(0, href.length - trailingLength);
-      matchLength -= trailingLength;
-    }
-
-    // If an autolink ends in a semicolon (;), we check to see if it appears
-    // to resemble an
-    // [entity reference](https://github.github.com/gfm/#entity-references);
-    // if the preceding text is & followed by one or more alphanumeric
-    // characters. If so, it is excluded from the autolink:
-    // https://github.github.com/gfm/#example-602
-    if (url.endsWith(';')) {
-      final entityRef = regExpEndsWithColon.firstMatch(url);
-      if (entityRef != null) {
-        // Strip out HTML entity reference
-        final entityRefLength = entityRef.match.length;
-        url = url.substring(0, url.length - entityRefLength);
-        href = href.substring(0, href.length - entityRefLength);
-        matchLength -= entityRefLength;
-      }
+    var destination = text;
+    if (isEmailLink) {
+      destination = 'mailto:$destination';
+    } else if (destination[0] == 'w') {
+      // When there is no scheme specified, insert the scheme `http`.
+      destination = 'http://$destination';
     }
 
-    // The scheme http will be inserted automatically
-    if (!href.startsWith('http://') &&
-        !href.startsWith('https://') &&
-        !href.startsWith('ftp://')) {
-      href = 'http://$href';
-    }
+    final anchor = Element.text('a', text)
+      ..attributes['href'] = Uri.encodeFull(destination);
 
-    final text = parser.encodeHtml ? escapeHtml(url) : url;
-    final anchor = Element.text('a', text);
-    anchor.attributes['href'] = Uri.encodeFull(href);
-    parser.addNode(anchor);
+    parser
+      ..addNode(anchor)
+      ..consume(consumeLength);
 
-    parser.consume(matchLength);
-    return false;
+    return true;
   }
 
-  int _countChars(String input, String char) {
-    var count = 0;
-
-    for (var i = 0; i < input.length; i++) {
-      if (input[i] == char) count++;
+  int _getConsumeLength(String text) {
+    var excludedLength = 0;
+
+    // When an autolink ends in `)`, see
+    // https://github.github.com/gfm/#example-624.
+    if (text.endsWith(')')) {
+      final match = RegExp(r'(\(.*)?(\)+)$').firstMatch(text)!;
+
+      if (match[1] == null) {
+        excludedLength = match[2]!.length;
+      } else {
+        var parenCount = 0;
+        for (var i = 0; i < text.length; i++) {
+          final char = text.codeUnitAt(i);
+          if (char == $lparen) {
+            parenCount++;
+          } else if (char == $rparen) {
+            parenCount--;
+          }
+        }
+        if (parenCount < 0) {
+          excludedLength = parenCount.abs();
+        }
+      }
+    }
+    // If an autolink ends in a semicolon `;`, see
+    // https://github.github.com/gfm/#example-626
+    else if (text.endsWith(';')) {
+      final match = RegExp(r'&[0-9a-z]+;$').firstMatch(text);
+      if (match != null) {
+        excludedLength = match.match.length;
+      }
     }
 
-    return count;
+    return text.length - excludedLength;
   }
 }
diff --git a/test/gfm/autolinks.unit b/test/gfm/autolinks.unit
@@ -57,7 +57,7 @@
 >>> Autolinks - 616
 < http://foo.bar >
 <<<
-<p>&lt; <a href="http://foo.bar">http://foo.bar</a> &gt;</p>
+<p>&lt; http://foo.bar &gt;</p>
 >>> Autolinks - 617
 <m:abc>
 <<<
@@ -69,7 +69,7 @@
 >>> Autolinks - 619
 http://example.com
 <<<
-<p><a href="http://example.com">http://example.com</a></p>
+<p>http://example.com</p>
 >>> Autolinks - 620
 foo@bar.example.com
 <<<
diff --git a/test/gfm/autolinks_extension.unit b/test/gfm/autolinks_extension.unit
@@ -23,9 +23,9 @@ www.google.com/search?q=Markup+(business)))
 (www.google.com/search?q=Markup+(business)
 <<<
 <p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a></p>
-<p><a href="http://www.google.com/search?q=Markup+(business))">www.google.com/search?q=Markup+(business))</a>)</p>
 <p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>))</p>
-<p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>)</p>
+<p>(<a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>)</p>
+<p>(<a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a></p>
 >>> Autolinks (extension) - 625
 www.google.com/search?q=(business))+ok
 <<<
@@ -35,7 +35,7 @@ www.google.com/search?q=commonmark&hl=en
 
 www.google.com/search?q=commonmark&hl;
 <<<
-<p><a href="http://www.google.com/search?q=commonmark&hl=en">www.google.com/search?q=commonmark&amp;hl=en</a></p>
+<p><a href="http://www.google.com/search?q=commonmark&amp;hl=en">www.google.com/search?q=commonmark&amp;hl=en</a></p>
 <p><a href="http://www.google.com/search?q=commonmark">www.google.com/search?q=commonmark</a>&hl;</p>
 >>> Autolinks (extension) - 627
 www.commonmark.org/he<lp
@@ -54,11 +54,11 @@ Anonymous FTP is available at ftp://foo.bar.baz.
 >>> Autolinks (extension) - 629
 foo@bar.baz
 <<<
-<p>foo@bar.baz</p>
+<p><a href="mailto:foo@bar.baz">foo@bar.baz</a></p>
 >>> Autolinks (extension) - 630
 hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.
 <<<
-<p>hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.</p>
+<p>hello@mail+xyz.example isn't valid, but <a href="mailto:hello+xyz@mail.example">hello+xyz@mail.example</a> is.</p>
 >>> Autolinks (extension) - 631
 a.b-c_d@a.b
 
@@ -68,7 +68,7 @@ a.b-c_d@a.b-
 
 a.b-c_d@a.b_
 <<<
-<p>a.b-c_d@a.b</p>
-<p>a.b-c_d@a.b.</p>
+<p><a href="mailto:a.b-c_d@a.b">a.b-c_d@a.b</a></p>
+<p><a href="mailto:a.b-c_d@a.b">a.b-c_d@a.b</a>.</p>
 <p>a.b-c_d@a.b-</p>
 <p>a.b-c_d@a.b_</p>
diff --git a/test/markdown_test.dart b/test/markdown_test.dart
@@ -55,7 +55,7 @@ void main() async {
   );
 
   testDirectory('common_mark');
-  testDirectory('gfm', extensionSet: ExtensionSet.gitHubFlavored);
+  testDirectory('gfm');
 
   group('Corner cases', () {
     validateCore('Incorrect Links', '''
diff --git a/test/util.dart b/test/util.dart
@@ -14,11 +14,40 @@ void testDirectory(String name, {ExtensionSet? extensionSet}) {
   for (final dataCase in dataCasesUnder(testDirectory: name)) {
     final description =
         '${dataCase.directory}/${dataCase.file}.unit ${dataCase.description}';
+
+    final inlineSyntaxes = <InlineSyntax>[];
+    final blockSyntaxes = <BlockSyntax>[];
+
+    if (dataCase.file.endsWith('_extension')) {
+      final extension = dataCase.file.substring(
+        0,
+        dataCase.file.lastIndexOf('_extension'),
+      );
+      switch (extension) {
+        case 'autolinks':
+          inlineSyntaxes.add(AutolinkExtensionSyntax());
+          break;
+        case 'strikethrough':
+          inlineSyntaxes.add(StrikethroughSyntax());
+          break;
+        case 'tables':
+          blockSyntaxes.add(const TableSyntax());
+          break;
+        case 'disallowed_raw_html':
+          // TODO(Zhiguang): https://github.com/dart-lang/markdown/pull/447
+          break;
+        default:
+          throw UnimplementedError('Unimplemented extension "$extension"');
+      }
+    }
+
     validateCore(
       description,
       dataCase.input,
       dataCase.expectedOutput,
       extensionSet: extensionSet,
+      inlineSyntaxes: inlineSyntaxes,
+      blockSyntaxes: blockSyntaxes,
     );
   }
 }
diff --git a/tool/gfm_stats.json b/tool/gfm_stats.json
@@ -34,24 +34,24 @@
   "613": "strict",
   "614": "strict",
   "615": "strict",
-  "616": "fail",
+  "616": "strict",
   "617": "strict",
   "618": "strict",
-  "619": "fail",
+  "619": "strict",
   "620": "strict"
  },
  "Autolinks (extension)": {
   "621": "strict",
   "622": "strict",
   "623": "strict",
-  "624": "fail",
+  "624": "strict",
   "625": "strict",
   "626": "loose",
   "627": "strict",
   "628": "strict",
-  "629": "fail",
-  "630": "fail",
-  "631": "fail"
+  "629": "strict",
+  "630": "strict",
+  "631": "strict"
  },
  "Backslash escapes": {
   "308": "loose",
diff --git a/tool/gfm_stats.txt b/tool/gfm_stats.txt
@@ -1,6 +1,6 @@
   17 of   18 –  94.4%  ATX headings
-  17 of   19 –  89.5%  Autolinks
-   7 of   11 –  63.6%  Autolinks (extension)
+  19 of   19 – 100.0%  Autolinks
+  11 of   11 – 100.0%  Autolinks (extension)
   12 of   13 –  92.3%  Backslash escapes
    1 of    1 – 100.0%  Blank lines
   23 of   25 –  92.0%  Block quotes
@@ -28,5 +28,5 @@
   11 of   11 – 100.0%  Tabs
    3 of    3 – 100.0%  Textual content
   19 of   19 – 100.0%  Thematic breaks
- 634 of  671 –  94.5%  TOTAL
- 569 of  634 –  89.7%  TOTAL Strict
+ 640 of  671 –  95.4%  TOTAL
+ 575 of  640 –  89.8%  TOTAL Strict
diff --git a/tool/stats.dart b/tool/stats.dart
diff --git a/tool/stats_lib.dart b/tool/stats_lib.dart