Skip to content
This repository was archived by the owner on Feb 24, 2025. It is now read-only.

Commit 49eefd2

Browse files
authored
Refactor AutolinkExtensionSyntax (#471)
* Refactor AutolinkExtensionSyntax Fix https://github.com/dart-lang/markdown/issues/470 * Improve readability * optimise a bit * Fix some requests from review * Update gfm_stats.txt * An optimisation.
1 parent 07e2683 commit 49eefd2

File tree

9 files changed

+191
-132
lines changed

9 files changed

+191
-132
lines changed

lib/src/inline_syntaxes/autolink_extension_syntax.dart

Lines changed: 99 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -3,134 +3,128 @@
33
// BSD-style license that can be found in the LICENSE file.
44

55
import '../ast.dart';
6+
import '../charcode.dart';
67
import '../inline_parser.dart';
78
import '../util.dart';
89
import 'inline_syntax.dart';
910

10-
/// Matches autolinks like `http://foo.com`.
11+
/// Matches autolinks like `http://foo.com` and `foo@bar.com`.
1112
class AutolinkExtensionSyntax extends InlineSyntax {
12-
/// Broken up parts of the autolink regex for reusability and readability
13-
14-
// Autolinks can only come at the beginning of a line, after whitespace, or
15-
// any of the delimiting characters *, _, ~, and (.
16-
static const start = r'(?:^|[\s*_~(>])';
17-
18-
// An extended url autolink will be recognized when one of the schemes
19-
// http://, https://, or ftp://, followed by a valid domain
20-
static const scheme = r'(?:(?:https?|ftp):\/\/|www\.)';
21-
22-
// A valid domain consists of alphanumeric characters, underscores (_),
23-
// hyphens (-) and periods (.). There must be at least one period, and no
24-
// underscores may be present in the last two segments of the domain.
25-
static const domainPart = r'\w\-';
26-
static const domain = '[$domainPart][$domainPart.]+';
27-
28-
// A valid domain consists of alphanumeric characters, underscores (_),
29-
// hyphens (-) and periods (.).
30-
static const path = r'[^\s<]*';
31-
32-
// Trailing punctuation (specifically, ?, !, ., ,, :, *, _, and ~) will not
33-
// be considered part of the autolink
34-
static const truncatingPunctuationPositive = '[?!.,:*_~]';
35-
36-
static final regExpTrailingPunc = RegExp('$truncatingPunctuationPositive*\$');
37-
static final regExpEndsWithColon = RegExp(r'\&[a-zA-Z0-9]+;$');
38-
static final regExpWhiteSpace = RegExp(r'\s');
39-
40-
AutolinkExtensionSyntax() : super('$start(($scheme)($domain)($path))');
13+
static const _linkPattern =
14+
// Autolinks can only come at the beginning of a line, after whitespace,
15+
// or any of the delimiting characters *, _, ~, and (.
16+
r'(?<=^|[\s*_~(>])'
17+
18+
// An extended url autolink will be recognised when one of the schemes
19+
// http://, https:// or ftp://, or `www.`, precedes a valid domain. See
20+
// https://github.github.com/gfm/#extended-url-autolink.
21+
r'(?:(?:https?|ftp):\/\/|www\.)'
22+
23+
// A valid domain consists of segments of alphanumeric characters,
24+
// underscores (_) and hyphens (-) separated by periods (.). There must
25+
// be at least one period, and no underscores may be present in the last
26+
// two segments of the domain. See
27+
// https://github.github.com/gfm/#valid-domain.
28+
r'(?:[-_a-z0-9]+\.)*(?:[-a-z0-9]+\.[-a-z0-9]+)'
29+
30+
// After a valid domain, zero or more non-space non-< characters may
31+
// follow.
32+
r'[^\s<]*'
33+
34+
// Trailing punctuation (specifically, ?, !, ., ,, :, *, _, and ~) will
35+
// not be considered part of the autolink, though they may be included in
36+
// the interior of the link. See
37+
// https://github.github.com/gfm/#extended-autolink-path-validation.
38+
'(?<![?!.,:*_~])';
39+
40+
// An extended email autolink, see
41+
// https://github.github.com/gfm/#extended-email-autolink.
42+
static const _emailPattern =
43+
r'[-_.+a-z0-9]+@(?:[-_a-z0-9]+\.)+[-_a-z0-9]*[a-z0-9](?![-_])';
44+
45+
AutolinkExtensionSyntax()
46+
: super(
47+
'($_linkPattern)|($_emailPattern)',
48+
caseSensitive: false,
49+
);
4150

4251
@override
4352
bool tryMatch(InlineParser parser, [int? startMatchPos]) {
44-
return super.tryMatch(parser, parser.pos > 0 ? parser.pos - 1 : 0);
53+
startMatchPos ??= parser.pos;
54+
final startMatch = pattern.matchAsPrefix(parser.source, startMatchPos);
55+
if (startMatch == null) {
56+
return false;
57+
}
58+
parser.writeText();
59+
return onMatch(parser, startMatch);
4560
}
4661

4762
@override
4863
bool onMatch(InlineParser parser, Match match) {
49-
var url = match[1]!;
50-
var href = url;
51-
var matchLength = url.length;
52-
53-
if (url[0] == '>' || url.startsWith(regExpWhiteSpace)) {
54-
url = url.substring(1, url.length - 1);
55-
href = href.substring(1, href.length - 1);
56-
parser.pos++;
57-
matchLength--;
58-
}
64+
int consumeLength;
5965

60-
// Prevent accidental standard autolink matches
61-
if (url.endsWith('>') && parser.source[parser.pos - 1] == '<') {
62-
return false;
66+
final isEmailLink = match[2] != null;
67+
if (isEmailLink) {
68+
consumeLength = match.match.length;
69+
} else {
70+
consumeLength = _getConsumeLength(match.match);
6371
}
6472

65-
// When an autolink ends in ), we scan the entire autolink for the total
66-
// number of parentheses. If there is a greater number of closing
67-
// parentheses than opening ones, we don’t consider the last character
68-
// part of the autolink, in order to facilitate including an autolink
69-
// inside a parenthesis:
70-
// https://github.github.com/gfm/#example-600
71-
if (url.endsWith(')')) {
72-
final opening = _countChars(url, '(');
73-
final closing = _countChars(url, ')');
74-
75-
if (closing > opening) {
76-
url = url.substring(0, url.length - 1);
77-
href = href.substring(0, href.length - 1);
78-
matchLength--;
79-
}
80-
}
73+
var text = match.match.substring(0, consumeLength);
74+
text = parser.encodeHtml ? escapeHtml(text) : text;
8175

82-
// Trailing punctuation (specifically, ?, !, ., ,, :, *, _, and ~) will
83-
// not be considered part of the autolink, though they may be included
84-
// in the interior of the link:
85-
// https://github.github.com/gfm/#example-599
86-
final trailingPunc = regExpTrailingPunc.firstMatch(url);
87-
if (trailingPunc != null) {
88-
final trailingLength = trailingPunc.match.length;
89-
url = url.substring(0, url.length - trailingLength);
90-
href = href.substring(0, href.length - trailingLength);
91-
matchLength -= trailingLength;
92-
}
93-
94-
// If an autolink ends in a semicolon (;), we check to see if it appears
95-
// to resemble an
96-
// [entity reference](https://github.github.com/gfm/#entity-references);
97-
// if the preceding text is & followed by one or more alphanumeric
98-
// characters. If so, it is excluded from the autolink:
99-
// https://github.github.com/gfm/#example-602
100-
if (url.endsWith(';')) {
101-
final entityRef = regExpEndsWithColon.firstMatch(url);
102-
if (entityRef != null) {
103-
// Strip out HTML entity reference
104-
final entityRefLength = entityRef.match.length;
105-
url = url.substring(0, url.length - entityRefLength);
106-
href = href.substring(0, href.length - entityRefLength);
107-
matchLength -= entityRefLength;
108-
}
76+
var destination = text;
77+
if (isEmailLink) {
78+
destination = 'mailto:$destination';
79+
} else if (destination[0] == 'w') {
80+
// When there is no scheme specified, insert the scheme `http`.
81+
destination = 'http://$destination';
10982
}
11083

111-
// The scheme http will be inserted automatically
112-
if (!href.startsWith('http://') &&
113-
!href.startsWith('https://') &&
114-
!href.startsWith('ftp://')) {
115-
href = 'http://$href';
116-
}
84+
final anchor = Element.text('a', text)
85+
..attributes['href'] = Uri.encodeFull(destination);
11786

118-
final text = parser.encodeHtml ? escapeHtml(url) : url;
119-
final anchor = Element.text('a', text);
120-
anchor.attributes['href'] = Uri.encodeFull(href);
121-
parser.addNode(anchor);
87+
parser
88+
..addNode(anchor)
89+
..consume(consumeLength);
12290

123-
parser.consume(matchLength);
124-
return false;
91+
return true;
12592
}
12693

127-
int _countChars(String input, String char) {
128-
var count = 0;
129-
130-
for (var i = 0; i < input.length; i++) {
131-
if (input[i] == char) count++;
94+
int _getConsumeLength(String text) {
95+
var excludedLength = 0;
96+
97+
// When an autolink ends in `)`, see
98+
// https://github.github.com/gfm/#example-625.
99+
if (text.endsWith(')')) {
100+
final match = RegExp(r'(\(.*)?(\)+)$').firstMatch(text)!;
101+
102+
if (match[1] == null) {
103+
excludedLength = match[2]!.length;
104+
} else {
105+
var parenCount = 0;
106+
for (var i = 0; i < text.length; i++) {
107+
final char = text.codeUnitAt(i);
108+
if (char == $lparen) {
109+
parenCount++;
110+
} else if (char == $rparen) {
111+
parenCount--;
112+
}
113+
}
114+
if (parenCount < 0) {
115+
excludedLength = parenCount.abs();
116+
}
117+
}
118+
}
119+
// If an autolink ends in a semicolon `;`, see
120+
// https://github.github.com/gfm/#example-627
121+
else if (text.endsWith(';')) {
122+
final match = RegExp(r'&[0-9a-z]+;$').firstMatch(text);
123+
if (match != null) {
124+
excludedLength = match.match.length;
125+
}
132126
}
133127

134-
return count;
128+
return text.length - excludedLength;
135129
}
136130
}

test/gfm/autolinks.unit

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757
>>> Autolinks - 616
5858
< http://foo.bar >
5959
<<<
60-
<p>&lt; <a href="http://foo.bar">http://foo.bar</a> &gt;</p>
60+
<p>&lt; http://foo.bar &gt;</p>
6161
>>> Autolinks - 617
6262
<m:abc>
6363
<<<
@@ -69,7 +69,7 @@
6969
>>> Autolinks - 619
7070
http://example.com
7171
<<<
72-
<p><a href="http://example.com">http://example.com</a></p>
72+
<p>http://example.com</p>
7373
>>> Autolinks - 620
7474
foo@bar.example.com
7575
<<<

test/gfm/autolinks_extension.unit

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ www.google.com/search?q=Markup+(business)))
2323
(www.google.com/search?q=Markup+(business)
2424
<<<
2525
<p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a></p>
26-
<p><a href="http://www.google.com/search?q=Markup+(business))">www.google.com/search?q=Markup+(business))</a>)</p>
2726
<p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>))</p>
28-
<p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>)</p>
27+
<p>(<a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>)</p>
28+
<p>(<a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a></p>
2929
>>> Autolinks (extension) - 625
3030
www.google.com/search?q=(business))+ok
3131
<<<
@@ -35,7 +35,7 @@ www.google.com/search?q=commonmark&hl=en
3535

3636
www.google.com/search?q=commonmark&hl;
3737
<<<
38-
<p><a href="http://www.google.com/search?q=commonmark&hl=en">www.google.com/search?q=commonmark&amp;hl=en</a></p>
38+
<p><a href="http://www.google.com/search?q=commonmark&amp;hl=en">www.google.com/search?q=commonmark&amp;hl=en</a></p>
3939
<p><a href="http://www.google.com/search?q=commonmark">www.google.com/search?q=commonmark</a>&hl;</p>
4040
>>> Autolinks (extension) - 627
4141
www.commonmark.org/he<lp
@@ -54,11 +54,11 @@ Anonymous FTP is available at ftp://foo.bar.baz.
5454
>>> Autolinks (extension) - 629
5555
foo@bar.baz
5656
<<<
57-
<p>foo@bar.baz</p>
57+
<p><a href="mailto:foo@bar.baz">foo@bar.baz</a></p>
5858
>>> Autolinks (extension) - 630
5959
hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.
6060
<<<
61-
<p>hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.</p>
61+
<p>hello@mail+xyz.example isn't valid, but <a href="mailto:hello+xyz@mail.example">hello+xyz@mail.example</a> is.</p>
6262
>>> Autolinks (extension) - 631
6363
a.b-c_d@a.b
6464

@@ -68,7 +68,7 @@ a.b-c_d@a.b-
6868

6969
a.b-c_d@a.b_
7070
<<<
71-
<p>a.b-c_d@a.b</p>
72-
<p>a.b-c_d@a.b.</p>
71+
<p><a href="mailto:a.b-c_d@a.b">a.b-c_d@a.b</a></p>
72+
<p><a href="mailto:a.b-c_d@a.b">a.b-c_d@a.b</a>.</p>
7373
<p>a.b-c_d@a.b-</p>
7474
<p>a.b-c_d@a.b_</p>

test/markdown_test.dart

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ void main() async {
5555
);
5656

5757
testDirectory('common_mark');
58-
testDirectory('gfm', extensionSet: ExtensionSet.gitHubFlavored);
58+
testDirectory('gfm');
5959

6060
group('Corner cases', () {
6161
validateCore('Incorrect Links', '''

test/util.dart

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,40 @@ void testDirectory(String name, {ExtensionSet? extensionSet}) {
1414
for (final dataCase in dataCasesUnder(testDirectory: name)) {
1515
final description =
1616
'${dataCase.directory}/${dataCase.file}.unit ${dataCase.description}';
17+
18+
final inlineSyntaxes = <InlineSyntax>[];
19+
final blockSyntaxes = <BlockSyntax>[];
20+
21+
if (dataCase.file.endsWith('_extension')) {
22+
final extension = dataCase.file.substring(
23+
0,
24+
dataCase.file.lastIndexOf('_extension'),
25+
);
26+
switch (extension) {
27+
case 'autolinks':
28+
inlineSyntaxes.add(AutolinkExtensionSyntax());
29+
break;
30+
case 'strikethrough':
31+
inlineSyntaxes.add(StrikethroughSyntax());
32+
break;
33+
case 'tables':
34+
blockSyntaxes.add(const TableSyntax());
35+
break;
36+
case 'disallowed_raw_html':
37+
// TODO(Zhiguang): https://github.com/dart-lang/markdown/pull/447
38+
break;
39+
default:
40+
throw UnimplementedError('Unimplemented extension "$extension"');
41+
}
42+
}
43+
1744
validateCore(
1845
description,
1946
dataCase.input,
2047
dataCase.expectedOutput,
2148
extensionSet: extensionSet,
49+
inlineSyntaxes: inlineSyntaxes,
50+
blockSyntaxes: blockSyntaxes,
2251
);
2352
}
2453
}

tool/gfm_stats.json

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,24 +34,24 @@
3434
"613": "strict",
3535
"614": "strict",
3636
"615": "strict",
37-
"616": "fail",
37+
"616": "strict",
3838
"617": "strict",
3939
"618": "strict",
40-
"619": "fail",
40+
"619": "strict",
4141
"620": "strict"
4242
},
4343
"Autolinks (extension)": {
4444
"621": "strict",
4545
"622": "strict",
4646
"623": "strict",
47-
"624": "fail",
47+
"624": "strict",
4848
"625": "strict",
4949
"626": "loose",
5050
"627": "strict",
5151
"628": "strict",
52-
"629": "fail",
53-
"630": "fail",
54-
"631": "fail"
52+
"629": "strict",
53+
"630": "strict",
54+
"631": "strict"
5555
},
5656
"Backslash escapes": {
5757
"308": "loose",

tool/gfm_stats.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
17 of 18 – 94.4% ATX headings
2-
17 of 19 – 89.5% Autolinks
3-
7 of 11 – 63.6% Autolinks (extension)
2+
19 of 19 – 100.0% Autolinks
3+
11 of 11 – 100.0% Autolinks (extension)
44
12 of 13 – 92.3% Backslash escapes
55
1 of 1 – 100.0% Blank lines
66
23 of 25 – 92.0% Block quotes
@@ -28,5 +28,5 @@
2828
11 of 11 – 100.0% Tabs
2929
3 of 3 – 100.0% Textual content
3030
19 of 19 – 100.0% Thematic breaks
31-
634 of 671 – 94.5% TOTAL
32-
569 of 634 – 89.7% TOTAL Strict
31+
640 of 671 – 95.4% TOTAL
32+
575 of 640 – 89.8% TOTAL Strict

0 commit comments

Comments
 (0)