-
Notifications
You must be signed in to change notification settings - Fork 1.5k
/
regexp.dart
488 lines (473 loc) · 19.7 KB
/
regexp.dart
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
part of dart.core;
/// A regular expression pattern.
///
/// Regular expressions (abbreviated as regex or regexp)
/// consist of a sequence of characters that specify
/// a match-checking algorithm for text _inputs_.
/// Applying a regexp to an input text results either in the regexp matching,
/// or accepting, the text, or the text being rejected.
/// When the regexp matches the text, it further provides some information
/// about *how* it matched the text.
///
/// Dart regular expressions have the same syntax and semantics as
/// JavaScript regular expressions.
/// To learn more about JavaScript regular expressions, see
/// <https://ecma-international.org/ecma-262/9.0/#sec-regexp-regular-expression-objects>.
///
/// Dart provides the basic regexp matching algorithm as [matchAsPrefix],
/// which checks if the regexp matches a part of the input starting
/// at a specific position.
/// If the regexp matches, Dart returns the details of the match as a
/// [RegExpMatch].
///
/// You can build all the other methods of [RegExp] from that basic match
/// check.
///
/// The most common use of a regexp is to *search* for a match in the input.
/// The [firstMatch] method provides this functionality.
/// This method searches a string for the first position where the regexp
/// matches.
/// Again, if a match is found, Dart returns its details as a [RegExpMatch].
///
/// The following example finds the first match of a regular expression in
/// a string.
/// ```dart
/// RegExp exp = RegExp(r'(\w+)');
/// String str = 'Parse my string';
/// RegExpMatch? match = exp.firstMatch(str);
/// print(match![0]); // "Parse"
/// ```
/// Use [allMatches] to look for all matches of a regular expression in
/// a string.
///
/// The following example finds all matches of a regular expression in
/// a string.
/// ```dart
/// RegExp exp = RegExp(r'(\w+)');
/// String str = 'Parse my string';
/// Iterable<RegExpMatch> matches = exp.allMatches(str);
/// for (final m in matches) {
/// print(m[0]);
/// }
/// ```
/// The output of the example is:
/// ```
/// Parse
/// my
/// string
/// ```
///
/// The preceding examples use a _raw string_,
/// a specific string type that prefixes the string literal with `r`.
/// Use a raw string to treat each character, including `\` and `$`,
/// in a string as a literal character. Each character then gets passed
/// to the [RegExp] parser.
/// You should use a raw string as the argument to the
/// [RegExp] constructor.
///
/// **Performance Notice**: Regular expressions do not resolve issues
/// magically.
/// Anyone can write a regexp that performs inefficiently when
/// applied to some string inputs.
/// Often, such a regexp will perform well enough on small or common
/// inputs, but have pathological performance on large and uncommon inputs.
/// This inconsistent behavior makes performance issues harder to detect
/// in testing.
///
/// A regexp might not find text any faster than using `String` operations
/// to inspect a string.
/// The strength of regexp comes from the ability to specify
/// *somewhat* complicated patterns in very few characters.
/// These regexps provide reasonable efficiency in most common cases.
/// This conciseness comes at a cost of readability.
/// Due to their syntactic complexity, regexps cannot be considered
/// self-documenting.
///
/// Dart regexps implement the ECMAScript RegExp specification.
/// This specification provides behavior that is both common and well-known.
/// When compiling Dart for the web, the compiled code can use the browser’s
/// regexp implementation.
///
/// The specification defines ECMAScript regexp behavior using *backtracking*.
/// When a regexp can choose between different ways to match,
/// it tries each way in the order given in the pattern.
/// For example: `RegExp(r"(foo|bar)baz")` wants to check for `foo` or `bar`,
/// so it checks for `foo` first.
/// If continuing along that path doesn't match the input,
/// the regexp implementation *backtracks*.
/// The implementation resets to the original state from before
/// checking for `foo`,
/// forgetting all the work it has done after that,
/// and then tries the next choice; `bar` in this example.
///
/// The specification defines these choices
/// and the order in which they must be attempted.
/// If a regexp could match an input in more than one way,
/// the order of the choices decides which match the regexp returns.
/// Commonly used regexps order their matching choices to ensure
/// a specific result.
/// The ECMAScript regexp specification limits how Dart
/// can implement regular expressions.
/// It must be a backtracking implementation which checks choices in
/// a specific order.
/// Dart cannot choose a different regexp implementation,
/// because then regexp matching would behave differently.
///
/// The backtracking approach works, but at a cost.
/// For some regexps and some inputs,
/// finding a *correct* match can take a *lot* of tries.
/// It can take even more tries to reject an input
/// that the regexp *almost* matches.
///
/// A well-known dangerous regexp pattern comes from
/// nesting quantifiers like `*`:
///
/// ```dart
/// var re = RegExp(r"^(a*|b)*c");
/// print(re.hasMatch("aaaaaaaaaaaaaaaaaaaaaaaaaaaaa"));
/// ```
///
/// The regexp pattern doesn't match the input string of only `a`s
/// as the input doesn’t contain the required `c`.
/// There exists an _exponential_ number of different ways for `(a*|b)*`
/// to match all the `a`s.
/// The backtracking regexp implementation tries *all* of them
/// before deciding that none of those can lead to a complete match.
/// Each extra `a` added to the input doubles the time the regexp
/// takes to return `false`.
/// (When backtracking has this exponential potential, it is called
/// [“catastrophic backtracking”](https://www.google.com/search?q=regexp+catastrophic+backtracking)).
///
/// Sequential quantifiers provide another dangerous pattern,
/// but they provide “only” polynomial complexity.
///
/// ```dart
/// // Like `\w*-\d`, but check for `b` and `c` in that order.
/// var re = RegExp(r"^\w*(b)?\w*(c)?\w*-\d");
/// print(re.hasMatch("a" * 512));
/// ```
///
/// Again the input doesn’t match, but `RegExp` must try *n*<sup>3</sup> ways
/// to match the *n* `a`s _before_ deciding that.
/// Doubling the input’s length increases the time to return `false`
/// _eightfold_.
/// This exponent increases with the number of sequential quantifiers.
///
/// Both of these patterns look trivial when reduced to such simple regexps.
/// However, these "trivial" patterns often arise as parts of
/// more complicated regular expressions,
/// where your ability to find the problem gets more difficult.
///
/// In general, if a regexp has potential for *super-linear complexity*,
/// you can craft an input that takes an inordinate amount of time to search.
/// These patterns can then be used for [denial of service attacks](https://en.wikipedia.org/wiki/ReDoS)
/// if you apply vulnerable regexp patterns to user-provided inputs.
///
/// No guaranteed solution exists for this problem.
/// Be careful to not use regexps with super-linear behavior
/// where the program may match that regexp against inputs
/// with no guaranteed match.
///
/// Rules of thumb to avoid regexps with super-linear execution time include:
///
/// * Whenever the regexp has a choice, try to make sure
/// that the choice can be made based on the next character
/// (or very limited look-ahead).
/// This limits the need to perform a lot of computation along both choices.
/// * When using quantifiers, ensure that the same string cannot match
/// both one and more-than-one iteration of the quantifier's
/// regular expression.
/// (For `(a*|b)*`, the string `"aa"` can match both
/// `(a*|b){1}` and `(a*|b){2}`.)
/// * Most uses of Dart regular expressions *search* for a match,
/// for example using [firstMatch].
/// If you do not *anchor* the pattern
/// to the start of a line or input using `^`,
/// this search acts as if the regexp began with an implicit `[^]*`.
/// Starting your actual regular expression with `.*`
/// then results in potential quadratic behavior for the search.
/// Use anchors or [matchAsPrefix] where appropriate,
/// or avoid starting the regexp with a quantified pattern.
/// * *For experts only:* Neither Dart nor ECMAScript have general
/// [“atomic grouping”](https://github.com/tc39/proposal-regexp-atomic-operators).
/// Other regular expression dialects use this to limit backtracking.
/// If an atomic capture group succeeds once,
/// the regexp cannot backtrack into the same match later.
/// As lookarounds also serve as atomic groups,
/// something similar can be achieved using a *lookahead*:
/// `var re = RegExp(r"^(?=((a*|b)*))\1d");`
/// The preceding example does the same inefficient matching of `(a*|b)*`.
/// Once the regexp has matched as far as possible,
/// it completes the positive lookahead.
/// Then it skips what the lookahead matched using a back-reference.
/// After that, it can no longer backtrack
/// and try other combinations of `a`s.
///
/// Try to reduce how many ways the regexp can match the same string.
/// That reduces the number of possible backtracks performed
/// when the regexp does not find a match.
/// Several guides to [improving the performance of regular expressions](https://www.google.com/search?q=performance+of+regular+expressions)
/// exist on the internet. Use these as inspirations, too.
abstract interface class RegExp implements Pattern {
/// Constructs a regular expression.
///
/// Throws a [FormatException] if [source] does not follow valid regular
/// expression syntax.
///
/// If your code enables `multiLine`, then `^` and `$` will match
/// the beginning and end of a _line_, as well as matching beginning and
/// end of the input, respectively.
///
/// If your code disables `caseSensitive`,
/// then Dart ignores the case of letters when matching.
/// For example, with `caseSensitive` disabled, the regexp pattern `a`
/// matches both `a` and `A`.
///
/// If your code enables `unicode`, then Dart treats the pattern as a
/// Unicode pattern per the ECMAScript standard.
///
/// If your code enables `dotAll`, then the `.` pattern will match _all_
/// characters, including line terminators.
///
/// Example:
///
/// ```dart
/// final wordPattern = RegExp(r'(\w+)');
/// final digitPattern = RegExp(r'(\d+)');
/// ```
///
/// These examples use a _raw string_ as the argument.
/// You should prefer to use a raw string as argument to the [RegExp]
/// constructor, because it makes it easy to write
/// the `\` and `$` characters as regexp reserved characters.
///
/// The same examples written using non-raw strings would be:
/// ```dart
/// final wordPattern = RegExp('(\\w+)'); // Should be raw string.
/// final digitPattern = RegExp('(\\d+)'); // Should be raw string.
/// ```
/// Use a non-raw string only when you need to use
/// string interpolation. For example:
/// ```dart
/// Pattern keyValuePattern(String keyIdentifier) =>
///     RegExp('$keyIdentifier=(\\w+)');
/// ```
/// When including a string verbatim into the regexp pattern like this,
/// be careful that the string does not contain regular expression
/// reserved characters.
/// If that risk exists, use the [escape] function to convert those
/// characters to safe versions of the reserved characters
/// and match only the string itself:
/// ```dart
/// Pattern keyValuePattern(String anyStringKey) =>
///     RegExp('${RegExp.escape(anyStringKey)}=(\\w+)');
/// ```
external factory RegExp(String source,
{bool multiLine = false,
bool caseSensitive = true,
@Since("2.4") bool unicode = false,
@Since("2.4") bool dotAll = false});
/// Creates regular expression syntax that matches the input [text].
///
/// If [text] contains regular expression reserved characters,
/// the resulting regular expression matches those characters literally.
/// If [text] contains no regular expression reserved characters,
/// Dart returns [text] unmodified.
///
/// The reserved characters in regular expressions are:
/// `(`, `)`, `[`, `]`, `{`, `}`, `*`, `+`, `?`, `.`, `^`, `$`, `|` and `\`.
///
/// Use this method to create a pattern to be included in a
/// larger regular expression. Since a [String] is itself a [Pattern]
/// which matches itself, converting the string to a regular expression
/// isn't needed to search for that exact string.
/// ```dart
/// print(RegExp.escape('dash@example.com')); // dash@example\.com
/// print(RegExp.escape('a+b')); // a\+b
/// print(RegExp.escape('a*b')); // a\*b
/// print(RegExp.escape('{a-b}')); // \{a-b\}
/// print(RegExp.escape('a?')); // a\?
/// ```
external static String escape(String text);
/// Finds the first match of the regular expression in the string [input].
///
/// Returns `null` if there is no match.
/// ```dart
/// final string = '[00:13.37] This is a chat message.';
/// final regExp = RegExp(r'c\w*');
/// final match = regExp.firstMatch(string)!;
/// print(match[0]); // chat
/// ```
RegExpMatch? firstMatch(String input);
/// Finds all matches of this regular expression in the string [input].
///
/// Each match is reported as a [RegExpMatch].
/// The optional [start] defaults to `0`; it is presumably the index in
/// [input] at which the search begins (see [Pattern.allMatches] to confirm).
/// ```dart
/// final exp = RegExp(r'(\w+)');
/// final matches = exp.allMatches('Parse my string');
/// for (final m in matches) {
///   print(m[0]); // Prints "Parse", then "my", then "string".
/// }
/// ```
Iterable<RegExpMatch> allMatches(String input, [int start = 0]);
/// Checks whether this regular expression has a match in the [input].
///
/// ```dart
/// var string = 'Dash is a bird';
/// var regExp = RegExp(r'(humming)?bird');
/// var match = regExp.hasMatch(string); // true
///
/// regExp = RegExp(r'dog');
/// match = regExp.hasMatch(string); // false
/// ```
bool hasMatch(String input);
/// Finds the string of the first match of this regular expression
/// in [input].
///
/// Searches for a match for this regular expression in [input],
/// just like [firstMatch],
/// but returns only the matched substring if a match is found,
/// not a [RegExpMatch].
/// Returns `null` if there is no match.
///
/// ```dart
/// var string = 'Dash is a bird';
/// var regExp = RegExp(r'(humming)?bird');
/// var match = regExp.stringMatch(string); // 'bird'
///
/// regExp = RegExp(r'dog');
/// match = regExp.stringMatch(string); // null, no match.
/// ```
String? stringMatch(String input);
/// The regular expression pattern source of this `RegExp`.
///
/// ```dart
/// final regExp = RegExp(r'\p{L}');
/// print(regExp.pattern); // \p{L}
/// ```
String get pattern;
/// Whether this regular expression matches multiple lines.
///
/// If the regexp does match multiple lines, the "^" and "$" characters
/// match the beginning and end of lines. If not, the characters match the
/// beginning and end of the input.
bool get isMultiLine;
/// Whether this regular expression is case sensitive.
///
/// If the regular expression is not case sensitive, it will match an input
/// letter with a pattern letter even if the two letters are different case
/// versions of the same letter.
/// ```dart
/// final text = 'Parse my string';
/// var regExp = RegExp(r'STRING', caseSensitive: false);
/// print(regExp.isCaseSensitive); // false
/// print(regExp.hasMatch(text)); // true, matches.
///
/// regExp = RegExp(r'STRING', caseSensitive: true);
/// print(regExp.isCaseSensitive); // true
/// print(regExp.hasMatch(text)); // false, no match.
/// ```
bool get isCaseSensitive;
/// Whether this regular expression uses Unicode mode.
///
/// In Unicode mode, Dart treats UTF-16 surrogate pairs in the original
/// string as a single code point and will not match each code unit in the
/// pair separately. Otherwise,
/// Dart treats the target string as a sequence of individual code
/// units and does not treat surrogates as special.
///
/// In Unicode mode, Dart restricts the syntax of the RegExp pattern,
/// for example disallowing some unescaped uses
/// of restricted regexp characters,
/// and disallowing unnecessary `\`-escapes ("identity escapes"),
/// which have both historically been allowed in non-Unicode mode.
/// Dart also allows some pattern features, like Unicode property escapes,
/// only in this mode.
/// ```dart
/// var regExp = RegExp(r'^\p{L}$', unicode: true);
/// print(regExp.hasMatch('a')); // true
/// print(regExp.hasMatch('b')); // true
/// print(regExp.hasMatch('?')); // false
/// print(regExp.hasMatch(r'p{L}')); // false
///
/// // U+1F600 (😀), one code point, two code units.
/// var smiley = '\ud83d\ude00';
///
/// regExp = RegExp(r'^.$', unicode: true); // Matches one code point.
/// print(regExp.hasMatch(smiley)); // true
/// regExp = RegExp(r'^..$', unicode: true); // Matches two code points.
/// print(regExp.hasMatch(smiley)); // false
///
/// regExp = RegExp(r'^\p{L}$', unicode: false);
/// print(regExp.hasMatch('a')); // false
/// print(regExp.hasMatch('b')); // false
/// print(regExp.hasMatch('?')); // false
/// print(regExp.hasMatch(r'p{L}')); // true
///
/// regExp = RegExp(r'^.$', unicode: false); // Matches one code unit.
/// print(regExp.hasMatch(smiley)); // false
/// regExp = RegExp(r'^..$', unicode: false); // Matches two code units.
/// print(regExp.hasMatch(smiley)); // true
/// ```
@Since("2.4")
bool get isUnicode;
/// Whether "." in this regular expression matches line terminators.
///
/// When false, the "." character matches a single character, unless that
/// character terminates a line. When true, then the "." character will
/// match any single character including line terminators.
///
/// This feature is distinct from [isMultiLine]. They affect the behavior
/// of different pattern characters, so they can be used together or
/// separately.
@Since("2.4")
bool get isDotAll;
}
/// A regular expression match.
///
/// Regular expression matches are [Match]es. They also include the ability
/// to retrieve the names for any named capture groups and to retrieve
/// matches for named capture groups by name instead of by their index.
///
/// Example:
/// ```dart
/// const pattern =
///     r'^\[(?<Time>\s*((?<hour>\d+)):((?<minute>\d+))\.((?<second>\d+)))\]'
///     r'\s(?<Message>\s*(.*)$)';
///
/// final regExp = RegExp(
///   pattern,
///   multiLine: true,
/// );
///
/// const multilineText = '[00:13.37] This is a first message.\n'
///     '[01:15.57] This is a second message.\n';
///
/// RegExpMatch regExpMatch = regExp.firstMatch(multilineText)!;
/// print(regExpMatch.groupNames.join('-')); // hour-minute-second-Time-Message.
/// final time = regExpMatch.namedGroup('Time'); // 00:13.37
/// final hour = regExpMatch.namedGroup('hour'); // 00
/// final minute = regExpMatch.namedGroup('minute'); // 13
/// final second = regExpMatch.namedGroup('second'); // 37
/// final message =
///     regExpMatch.namedGroup('Message'); // This is a first message.
/// final date = regExpMatch.namedGroup('Date'); // Undefined `Date`, throws.
///
/// Iterable<RegExpMatch> matches = regExp.allMatches(multilineText);
/// for (final m in matches) {
///   print(m.namedGroup('Time'));
///   print(m.namedGroup('Message'));
///   // 00:13.37
///   // This is a first message.
///   // 01:15.57
///   // This is a second message.
/// }
/// ```
@Since("2.3")
abstract interface class RegExpMatch implements Match {
/// The string captured by the named capture group [name].
///
/// Returns the substring of the input that the
/// labeled capture group, labeled [name], matched,
/// or `null` if that capture group was not part of the match.
///
/// The [name] must be the name of a named capture group in the regular
/// expression [pattern] which created this match.
/// That is, the name must be in [groupNames].
String? namedGroup(String name);
/// The names of the named capture groups of [pattern].
Iterable<String> get groupNames;
/// The regular expression that created this match.
RegExp get pattern;
}