Skip to content

Commit

Permalink
[VM] Partial support for named regexp captures.
Browse files Browse the repository at this point in the history
See https://github.com/tc39/proposal-regexp-named-groups
for a high-level description of the feature and examples.  This is one of the
features requested in #34935.

This is a partial implementation because while there is a way to retrieve
groups via Dart by name, it requires casting the returned Match to the
new RegExpMatch interface to avoid changing the RegExp interface.
Changing the RegExp interface will happen in a future update, since there
are other planned changes to the RegExp interface coming soon and that way
we only change it once. See #36171
for more details on the planned changes.

Also, since only BMP regular expressions are supported, not full
Unicode ones (i.e., those with the /u flag in ECMAScript), \k<NAME>
will only be parsed as a named back reference if there are named
captures in the string. Otherwise, the \k will be parsed as the identity
escape for backwards compatibility. The new tests illustrate this
difference.

Change-Id: Ieeb0374813db78924c9aa8ac3e652dfb6d4a5934
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/95461
Commit-Queue: Stevie Strickland <sstrickl@google.com>
Reviewed-by: Lasse R.H. Nielsen <lrn@google.com>
Reviewed-by: Martin Kustermann <kustermann@google.com>
Reviewed-by: Jenny Messerly <jmesserly@google.com>
Reviewed-by: Johnni Winther <johnniwinther@google.com>
  • Loading branch information
sstrickl authored and commit-bot@chromium.org committed Mar 19, 2019
1 parent efb3f85 commit af5fc2d
Show file tree
Hide file tree
Showing 21 changed files with 555 additions and 14 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Expand Up @@ -9,6 +9,11 @@
### Dart VM

* RegExp patterns can now use lookbehind assertions.
* RegExp patterns can now use named capture groups and named backreferences.
Currently, named group matches can only be retrieved in Dart either by
the implicit index of the named group or by downcasting the returned Match
object to the type RegExpMatch. The RegExpMatch interface contains methods
for retrieving the available group names and retrieving a match by group name.

### Tool Changes

Expand Down
6 changes: 5 additions & 1 deletion pkg/dev_compiler/tool/input_sdk/private/js_helper.dart
Expand Up @@ -10,7 +10,11 @@ import 'dart:_foreign_helper' show JS, JS_STRING_CONCAT, JSExportName;

import 'dart:_interceptors';
import 'dart:_internal'
show EfficientLengthIterable, MappedIterable, IterableElementError;
show
EfficientLengthIterable,
MappedIterable,
IterableElementError,
SubListIterable;

import 'dart:_native_typed_data';
import 'dart:_runtime' as dart;
Expand Down
22 changes: 21 additions & 1 deletion pkg/dev_compiler/tool/input_sdk/private/regexp_helper.dart
Expand Up @@ -159,7 +159,7 @@ class JSSyntaxRegExp implements RegExp {
bool get isCaseSensitive => _isCaseSensitive;
}

class _MatchImplementation implements Match {
class _MatchImplementation implements RegExpMatch {
final Pattern pattern;
// Contains a JS RegExp match object.
// It is an Array of String values with extra "index" and "input" properties.
Expand All @@ -185,6 +185,26 @@ class _MatchImplementation implements Match {
}
return out;
}

// Returns the string captured by the group called [name].
//
// The lookup goes through the `groups` property of the underlying JS match
// object. A name that is a key of `groups` but whose value is null yields
// null; any other name (or a match without a `groups` object) throws an
// ArgumentError.
String namedGroup(String name) {
  var captures = JS('Object', '#.groups', _match);
  if (captures != null) {
    var captured = JS('String|Null', '#[#]', captures, name);
    // A null value alone is ambiguous, so only in that case ask JS whether
    // the key actually exists on the groups object.
    var isGroupName =
        captured != null || JS('bool', '# in #', name, captures);
    if (isGroupName) return captured;
  }
  throw ArgumentError.value(name, "name", "Not a capture group name");
}

// The names of the capture groups exposed on the JS match object's `groups`
// property, or an empty iterable when the match has no `groups` object.
Iterable<String> get groupNames {
  var captures = JS('Object', '#.groups', _match);
  if (captures == null) return Iterable.empty();

  // Copy the own keys of the groups object into a Dart-typed array and hand
  // out an iterable view over the whole of it.
  var names = JSArray<String>.of(JS('', 'Object.keys(#)', captures));
  return SubListIterable(names, 0, null);
}
}

class _AllMatchesIterable extends IterableBase<Match> {
Expand Down
16 changes: 16 additions & 0 deletions runtime/lib/regexp.cc
Expand Up @@ -60,6 +60,22 @@ DEFINE_NATIVE_ENTRY(RegExp_getGroupCount, 0, 1) {
return regexp.num_bracket_expressions();
}
const String& pattern = String::Handle(regexp.pattern());
const String& errmsg =
String::Handle(String::New("Regular expression is not initialized yet."));
const String& message = String::Handle(String::Concat(errmsg, pattern));
const Array& args = Array::Handle(Array::New(1));
args.SetAt(0, message);
Exceptions::ThrowByType(Exceptions::kFormat, args);
return Object::null();
}

DEFINE_NATIVE_ENTRY(RegExp_getGroupNameMap, 0, 1) {
const RegExp& regexp = RegExp::CheckedHandle(zone, arguments->NativeArgAt(0));
ASSERT(!regexp.IsNull());
if (regexp.is_initialized()) {
return regexp.capture_name_map();
}
const String& pattern = String::Handle(regexp.pattern());
const String& errmsg = String::Handle(
String::New("Regular expression is not initialized yet. "));
const String& message = String::Handle(String::Concat(errmsg, pattern));
Expand Down
38 changes: 37 additions & 1 deletion runtime/lib/regexp_patch.dart
Expand Up @@ -104,6 +104,8 @@ class RegExp {
new LinkedList<_RegExpHashKey>();

int get _groupCount;
Iterable<String> get _groupNames;
int _groupNameIndex(String name);
}

// Represents both a key in the regular expression cache as well as its
Expand Down Expand Up @@ -133,7 +135,7 @@ class _RegExpHashValue {
_RegExpHashValue(this.regexp, this.key);
}

class _RegExpMatch implements Match {
class _RegExpMatch implements RegExpMatch {
_RegExpMatch(this._regexp, this.input, this._match);

int get start => _start(0);
Expand Down Expand Up @@ -176,6 +178,18 @@ class _RegExpMatch implements Match {

Pattern get pattern => _regexp;

// Returns the string captured by the group called [name].
//
// The name is resolved to a capture index by the owning regexp and the result
// is whatever [group] returns for that index. Throws an ArgumentError when
// [name] is not the name of a capture group in the pattern.
String namedGroup(String name) {
  final idx = _regexp._groupNameIndex(name);
  if (idx < 0) {
    // Use ArgumentError.value so the error carries the argument name and the
    // offending value, matching the dev_compiler implementation of
    // RegExpMatch.namedGroup.
    throw ArgumentError.value(name, "name", "Not a capture group name");
  }
  return group(idx);
}

// The names of the named capture groups, as reported by the regexp that
// produced this match.
Iterable<String> get groupNames => _regexp._groupNames;

final RegExp _regexp;
final String input;
final List<int> _match;
Expand Down Expand Up @@ -240,6 +254,28 @@ class _RegExp implements RegExp {

int get _groupCount native "RegExp_getGroupCount";

// Returns a flat List [String, int, String, int, ...] where each
// String is the name of a capture group and the following
// int is that capture group's index.
//
// NOTE(review): the VM-side RegExpCompileData initializes its capture name
// map to a null Array, so this getter may return null for patterns without
// named groups -- confirm against the parser before relying on non-null.
List get _groupNameList native "RegExp_getGroupNameMap";

// Lazily yields the name of every named capture group of this regexp.
//
// [_groupNameList] holds (name, index) pairs flattened into a single list, so
// the names sit at the even positions. Yields nothing when the regexp has no
// capture name map.
Iterable<String> get _groupNames sync* {
  final nameList = _groupNameList;
  // The VM's compile data starts out with a null capture name map; treat a
  // null list as "no named groups" instead of dereferencing it.
  if (nameList == null) return;
  for (var i = 0; i < nameList.length; i += 2) {
    yield nameList[i] as String;
  }
}

// Returns the capture index of the group called [name], or -1 when this
// regexp has no group with that name.
//
// [_groupNameList] holds (name, index) pairs flattened into a single list, so
// the index of a name found at position i is stored at position i + 1.
int _groupNameIndex(String name) {
  final nameList = _groupNameList;
  // The VM's compile data starts out with a null capture name map; a null
  // list means no name can match, so report "not found" rather than
  // dereferencing it.
  if (nameList == null) return -1;
  for (var i = 0; i < nameList.length; i += 2) {
    if (name == nameList[i]) {
      return nameList[i + 1];
    }
  }
  return -1;
}

// Byte map of one byte characters with a 0xff if the character is a word
// character (digit, letter or underscore) and 0x00 otherwise.
// Used by generated RegExp code.
Expand Down
1 change: 1 addition & 0 deletions runtime/vm/bootstrap_natives.h
Expand Up @@ -103,6 +103,7 @@ namespace dart {
V(RegExp_getIsMultiLine, 1) \
V(RegExp_getIsCaseSensitive, 1) \
V(RegExp_getGroupCount, 1) \
V(RegExp_getGroupNameMap, 1) \
V(RegExp_ExecuteMatch, 3) \
V(RegExp_ExecuteMatchSticky, 3) \
V(List_new, 2) \
Expand Down
1 change: 1 addition & 0 deletions runtime/vm/compiler/jit/compiler.cc
Expand Up @@ -174,6 +174,7 @@ void IrregexpCompilationPipeline::ParseFunction(
RegExpParser::ParseRegExp(pattern, multiline, compile_data);

regexp.set_num_bracket_expressions(compile_data->capture_count);
regexp.set_capture_name_map(compile_data->capture_name_map);
if (compile_data->simple) {
regexp.set_is_simple();
} else {
Expand Down
4 changes: 4 additions & 0 deletions runtime/vm/object.cc
Expand Up @@ -21511,6 +21511,10 @@ void RegExp::set_num_bracket_expressions(intptr_t value) const {
StoreSmi(&raw_ptr()->num_bracket_expressions_, Smi::New(value));
}

// Stores the raw pointer of `array` in this RegExp's capture_name_map_ field.
// Uses StorePointer rather than a plain assignment because the field holds a
// heap object reference.
void RegExp::set_capture_name_map(const Array& array) const {
StorePointer(&raw_ptr()->capture_name_map_, array.raw());
}

RawRegExp* RegExp::New(Heap::Space space) {
RegExp& result = RegExp::Handle();
{
Expand Down
2 changes: 2 additions & 0 deletions runtime/vm/object.h
Expand Up @@ -8990,6 +8990,7 @@ class RegExp : public Instance {
RawSmi* num_bracket_expressions() const {
return raw_ptr()->num_bracket_expressions_;
}
// Returns the raw array held in capture_name_map_, i.e. the value last stored
// through set_capture_name_map(). The RegExp_getGroupNameMap native hands this
// array to Dart, where it is read as a flat [name, index, ...] list.
RawArray* capture_name_map() const { return raw_ptr()->capture_name_map_; }

RawTypedData* bytecode(bool is_one_byte, bool sticky) const {
if (sticky) {
Expand Down Expand Up @@ -9046,6 +9047,7 @@ class RegExp : public Instance {
const TypedData& bytecode) const;

void set_num_bracket_expressions(intptr_t value) const;
void set_capture_name_map(const Array& array) const;
void set_is_global() const { set_flags(flags() | kGlobal); }
void set_is_ignore_case() const { set_flags(flags() | kIgnoreCase); }
void set_is_multi_line() const { set_flags(flags() | kMultiLine); }
Expand Down
1 change: 1 addition & 0 deletions runtime/vm/raw_object.h
Expand Up @@ -2347,6 +2347,7 @@ class RawRegExp : public RawInstance {

VISIT_FROM(RawObject*, num_bracket_expressions_)
RawSmi* num_bracket_expressions_;
RawArray* capture_name_map_;
RawString* pattern_; // Pattern to be used for matching.
union {
RawFunction* function_;
Expand Down
1 change: 1 addition & 0 deletions runtime/vm/raw_object_fields.cc
Expand Up @@ -178,6 +178,7 @@ namespace dart {
F(StackTrace, code_array_) \
F(StackTrace, pc_offset_array_) \
F(RegExp, num_bracket_expressions_) \
F(RegExp, capture_name_map_) \
F(RegExp, pattern_) \
F(RegExp, external_one_byte_function_) \
F(RegExp, external_two_byte_function_) \
Expand Down
4 changes: 4 additions & 0 deletions runtime/vm/raw_object_snapshot.cc
Expand Up @@ -2149,8 +2149,12 @@ RawRegExp* RegExp::ReadFrom(SnapshotReader* reader,
// Read and Set all the other fields.
regex.StoreSmi(&regex.raw_ptr()->num_bracket_expressions_,
reader->ReadAsSmi());

*reader->ArrayHandle() ^= reader->ReadObjectImpl(kAsInlinedObject);
regex.set_capture_name_map(*reader->ArrayHandle());
*reader->StringHandle() ^= reader->ReadObjectImpl(kAsInlinedObject);
regex.set_pattern(*reader->StringHandle());

regex.StoreNonPointer(&regex.raw_ptr()->num_registers_,
reader->Read<int32_t>());
regex.StoreNonPointer(&regex.raw_ptr()->type_flags_, reader->Read<int8_t>());
Expand Down
2 changes: 2 additions & 0 deletions runtime/vm/regexp.h
Expand Up @@ -1318,12 +1318,14 @@ struct RegExpCompileData : public ZoneAllocated {
node(NULL),
simple(true),
contains_anchor(false),
capture_name_map(Array::Handle(Array::null())),
error(String::Handle(String::null())),
capture_count(0) {}
RegExpTree* tree;
RegExpNode* node;
bool simple;
bool contains_anchor;
Array& capture_name_map;
String& error;
intptr_t capture_count;
};
Expand Down
1 change: 1 addition & 0 deletions runtime/vm/regexp_assembler_bytecode.cc
Expand Up @@ -441,6 +441,7 @@ static intptr_t Prepare(const RegExp& regexp,
RegExpParser::ParseRegExp(pattern, multiline, compile_data);

regexp.set_num_bracket_expressions(compile_data->capture_count);
regexp.set_capture_name_map(compile_data->capture_name_map);
if (compile_data->simple) {
regexp.set_is_simple();
} else {
Expand Down
14 changes: 12 additions & 2 deletions runtime/vm/regexp_ast.h
Expand Up @@ -277,7 +277,8 @@ class RegExpQuantifier : public RegExpTree {

class RegExpCapture : public RegExpTree {
public:
explicit RegExpCapture(intptr_t index) : body_(nullptr), index_(index) {}
explicit RegExpCapture(intptr_t index)
: body_(nullptr), index_(index), name_(nullptr) {}
virtual void* Accept(RegExpVisitor* visitor, void* data);
virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
static RegExpNode* ToNode(RegExpTree* body,
Expand All @@ -298,12 +299,15 @@ class RegExpCapture : public RegExpTree {
// capture group is parsed.
void set_body(RegExpTree* body) { body_ = body; }
intptr_t index() const { return index_; }
// Name of this capture group as an array of uint16_t (presumably UTF-16 code
// units -- confirm against the parser). The constructor initializes it to
// nullptr, so it stays nullptr until set_name() is called.
const ZoneGrowableArray<uint16_t>* name() { return name_; }
// Records the group's name; only the pointer is stored, the array is not
// copied.
void set_name(const ZoneGrowableArray<uint16_t>* name) { name_ = name; }
static intptr_t StartRegister(intptr_t index) { return index * 2; }
static intptr_t EndRegister(intptr_t index) { return index * 2 + 1; }

private:
RegExpTree* body_;
intptr_t index_;
const ZoneGrowableArray<uint16_t>* name_;
};

class RegExpLookaround : public RegExpTree {
Expand Down Expand Up @@ -366,7 +370,9 @@ class RegExpLookaround : public RegExpTree {

class RegExpBackReference : public RegExpTree {
public:
explicit RegExpBackReference(RegExpCapture* capture) : capture_(capture) {}
RegExpBackReference() : capture_(nullptr), name_(nullptr) {}
explicit RegExpBackReference(RegExpCapture* capture)
: capture_(capture), name_(nullptr) {}
virtual void* Accept(RegExpVisitor* visitor, void* data);
virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
virtual RegExpBackReference* AsBackReference();
Expand All @@ -378,9 +384,13 @@ class RegExpBackReference : public RegExpTree {
virtual intptr_t max_match() const { return kInfinity; }
// Index of the referenced capture group. Dereferences capture_ without a null
// check: a back reference built with the default constructor has a nullptr
// capture_ until set_capture() is called, so index() must not be used before
// then.
intptr_t index() const { return capture_->index(); }
// The capture group this back reference points at; nullptr for a
// default-constructed back reference that has not been given a capture yet.
RegExpCapture* capture() const { return capture_; }
// Attaches the capture group this back reference refers to.
void set_capture(RegExpCapture* capture) { capture_ = capture; }
// Name used by a named back reference, as an array of uint16_t (presumably
// UTF-16 code units -- confirm against the parser). Both constructors
// initialize it to nullptr.
const ZoneGrowableArray<uint16_t>* name() { return name_; }
// Records the referenced group's name; only the pointer is stored, the array
// is not copied.
void set_name(const ZoneGrowableArray<uint16_t>* name) { name_ = name; }

private:
RegExpCapture* capture_;
const ZoneGrowableArray<uint16_t>* name_;
};

class RegExpEmpty : public RegExpTree {
Expand Down

0 comments on commit af5fc2d

Please sign in to comment.