diff --git a/CHANGELOG.md b/CHANGELOG.md index d937ee5c0308..a14d7dda644b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,11 @@ ### Dart VM * RegExp patterns can now use lookbehind assertions. +* RegExp patterns can now use named capture groups and named backreferences. + Currently, named group matches can only be retrieved in Dart either by + the implicit index of the named group or by downcasting the returned Match + object to the type RegExpMatch. The RegExpMatch interface contains methods + for retrieving the available group names and retrieving a match by group name. ### Tool Changes diff --git a/pkg/dev_compiler/tool/input_sdk/private/js_helper.dart b/pkg/dev_compiler/tool/input_sdk/private/js_helper.dart index 8916dcc6645e..cf6b1e65ddcb 100644 --- a/pkg/dev_compiler/tool/input_sdk/private/js_helper.dart +++ b/pkg/dev_compiler/tool/input_sdk/private/js_helper.dart @@ -10,7 +10,11 @@ import 'dart:_foreign_helper' show JS, JS_STRING_CONCAT, JSExportName; import 'dart:_interceptors'; import 'dart:_internal' - show EfficientLengthIterable, MappedIterable, IterableElementError; + show + EfficientLengthIterable, + MappedIterable, + IterableElementError, + SubListIterable; import 'dart:_native_typed_data'; import 'dart:_runtime' as dart; diff --git a/pkg/dev_compiler/tool/input_sdk/private/regexp_helper.dart b/pkg/dev_compiler/tool/input_sdk/private/regexp_helper.dart index 2d6ff70f5665..92066850dd8b 100644 --- a/pkg/dev_compiler/tool/input_sdk/private/regexp_helper.dart +++ b/pkg/dev_compiler/tool/input_sdk/private/regexp_helper.dart @@ -159,7 +159,7 @@ class JSSyntaxRegExp implements RegExp { bool get isCaseSensitive => _isCaseSensitive; } -class _MatchImplementation implements Match { +class _MatchImplementation implements RegExpMatch { final Pattern pattern; // Contains a JS RegExp match object. // It is an Array of String values with extra "index" and "input" properties. @@ -185,6 +185,26 @@ class _MatchImplementation implements Match { } return out; } + + String namedGroup(String name) { + var groups = JS('Object', '#.groups', _match); + if (groups != null) { + var result = JS('String|Null', '#[#]', groups, name); + if (result != null || JS('bool', '# in #', name, groups)) { + return result; + } + } + throw ArgumentError.value(name, "name", "Not a capture group name"); + } + + Iterable get groupNames { + var groups = JS('Object', '#.groups', _match); + if (groups != null) { + var keys = JSArray.of(JS('', 'Object.keys(#)', groups)); + return SubListIterable(keys, 0, null); + } + return Iterable.empty(); + } } class _AllMatchesIterable extends IterableBase { diff --git a/runtime/lib/regexp.cc b/runtime/lib/regexp.cc index aa58c5f3edad..4dba76e6eb7c 100644 --- a/runtime/lib/regexp.cc +++ b/runtime/lib/regexp.cc @@ -60,6 +60,22 @@ DEFINE_NATIVE_ENTRY(RegExp_getGroupCount, 0, 1) { return regexp.num_bracket_expressions(); } const String& pattern = String::Handle(regexp.pattern()); + const String& errmsg = + String::Handle(String::New("Regular expression is not initialized yet.")); + const String& message = String::Handle(String::Concat(errmsg, pattern)); + const Array& args = Array::Handle(Array::New(1)); + args.SetAt(0, message); + Exceptions::ThrowByType(Exceptions::kFormat, args); + return Object::null(); +} + +DEFINE_NATIVE_ENTRY(RegExp_getGroupNameMap, 0, 1) { + const RegExp& regexp = RegExp::CheckedHandle(zone, arguments->NativeArgAt(0)); + ASSERT(!regexp.IsNull()); + if (regexp.is_initialized()) { + return regexp.capture_name_map(); + } + const String& pattern = String::Handle(regexp.pattern()); const String& errmsg = String::Handle( String::New("Regular expression is not initialized yet. ")); const String& message = String::Handle(String::Concat(errmsg, pattern)); diff --git a/runtime/lib/regexp_patch.dart b/runtime/lib/regexp_patch.dart index de20a7272447..4eac4461187c 100644 --- a/runtime/lib/regexp_patch.dart +++ b/runtime/lib/regexp_patch.dart @@ -104,6 +104,8 @@ class RegExp { new LinkedList<_RegExpHashKey>(); int get _groupCount; + Iterable get _groupNames; + int _groupNameIndex(String name); } // Represents both a key in the regular expression cache as well as its @@ -133,7 +135,7 @@ class _RegExpHashValue { _RegExpHashValue(this.regexp, this.key); } -class _RegExpMatch implements Match { +class _RegExpMatch implements RegExpMatch { _RegExpMatch(this._regexp, this.input, this._match); int get start => _start(0); @@ -176,6 +178,18 @@ class _RegExpMatch implements Match { Pattern get pattern => _regexp; + String namedGroup(String name) { + var idx = _regexp._groupNameIndex(name); + if (idx < 0) { + throw ArgumentError("Not a capture group name: ${name}"); + } + return group(idx); + } + + Iterable get groupNames { + return _regexp._groupNames; + } + final RegExp _regexp; final String input; final List _match; @@ -240,6 +254,28 @@ class _RegExp implements RegExp { int get _groupCount native "RegExp_getGroupCount"; + // Returns a List [String, int, String, int, ...] where each + // String is the name of a capture group and the following + // int is that capture group's index. + List get _groupNameList native "RegExp_getGroupNameMap"; + + Iterable get _groupNames sync* { + final nameList = _groupNameList; + for (var i = 0; i < nameList.length; i += 2) { + yield nameList[i] as String; + } + } + + int _groupNameIndex(String name) { + var nameList = _groupNameList; + for (var i = 0; i < nameList.length; i += 2) { + if (name == nameList[i]) { + return nameList[i + 1]; + } + } + return -1; + } + // Byte map of one byte characters with a 0xff if the character is a word // character (digit, letter or underscore) and 0x00 otherwise. // Used by generated RegExp code. diff --git a/runtime/vm/bootstrap_natives.h b/runtime/vm/bootstrap_natives.h index da9161f91485..5c0b863a6557 100644 --- a/runtime/vm/bootstrap_natives.h +++ b/runtime/vm/bootstrap_natives.h @@ -103,6 +103,7 @@ namespace dart { V(RegExp_getIsMultiLine, 1) \ V(RegExp_getIsCaseSensitive, 1) \ V(RegExp_getGroupCount, 1) \ + V(RegExp_getGroupNameMap, 1) \ V(RegExp_ExecuteMatch, 3) \ V(RegExp_ExecuteMatchSticky, 3) \ V(List_new, 2) \ diff --git a/runtime/vm/compiler/jit/compiler.cc b/runtime/vm/compiler/jit/compiler.cc index db4a50762b55..52dd77c47c39 100644 --- a/runtime/vm/compiler/jit/compiler.cc +++ b/runtime/vm/compiler/jit/compiler.cc @@ -174,6 +174,7 @@ void IrregexpCompilationPipeline::ParseFunction( RegExpParser::ParseRegExp(pattern, multiline, compile_data); regexp.set_num_bracket_expressions(compile_data->capture_count); + regexp.set_capture_name_map(compile_data->capture_name_map); if (compile_data->simple) { regexp.set_is_simple(); } else { diff --git a/runtime/vm/object.cc b/runtime/vm/object.cc index 496631e2a1c7..3cae1d462095 100644 --- a/runtime/vm/object.cc +++ b/runtime/vm/object.cc @@ -21511,6 +21511,10 @@ void RegExp::set_num_bracket_expressions(intptr_t value) const { StoreSmi(&raw_ptr()->num_bracket_expressions_, Smi::New(value)); } +void RegExp::set_capture_name_map(const Array& array) const { + StorePointer(&raw_ptr()->capture_name_map_, array.raw()); +} + RawRegExp* RegExp::New(Heap::Space space) { RegExp& result = RegExp::Handle(); { diff --git a/runtime/vm/object.h b/runtime/vm/object.h index 412996336b09..798331adbe8b 100644 --- a/runtime/vm/object.h +++ b/runtime/vm/object.h @@ -8990,6 +8990,7 @@ class RegExp : public Instance { RawSmi* num_bracket_expressions() const { return raw_ptr()->num_bracket_expressions_; } + RawArray* capture_name_map() const { return raw_ptr()->capture_name_map_; } RawTypedData* bytecode(bool is_one_byte, bool sticky) const { if (sticky) { @@ -9046,6 +9047,7 @@ class RegExp : public Instance { const TypedData& bytecode) const; void set_num_bracket_expressions(intptr_t value) const; + void set_capture_name_map(const Array& array) const; void set_is_global() const { set_flags(flags() | kGlobal); } void set_is_ignore_case() const { set_flags(flags() | kIgnoreCase); } void set_is_multi_line() const { set_flags(flags() | kMultiLine); } diff --git a/runtime/vm/raw_object.h b/runtime/vm/raw_object.h index 63927035a0c3..08f3ecfcdf40 100644 --- a/runtime/vm/raw_object.h +++ b/runtime/vm/raw_object.h @@ -2347,6 +2347,7 @@ class RawRegExp : public RawInstance { VISIT_FROM(RawObject*, num_bracket_expressions_) RawSmi* num_bracket_expressions_; + RawArray* capture_name_map_; RawString* pattern_; // Pattern to be used for matching. union { RawFunction* function_; diff --git a/runtime/vm/raw_object_fields.cc b/runtime/vm/raw_object_fields.cc index f1dc7ccc100c..6bb1ac1d060e 100644 --- a/runtime/vm/raw_object_fields.cc +++ b/runtime/vm/raw_object_fields.cc @@ -178,6 +178,7 @@ namespace dart { F(StackTrace, code_array_) \ F(StackTrace, pc_offset_array_) \ F(RegExp, num_bracket_expressions_) \ + F(RegExp, capture_name_map_) \ F(RegExp, pattern_) \ F(RegExp, external_one_byte_function_) \ F(RegExp, external_two_byte_function_) \ diff --git a/runtime/vm/raw_object_snapshot.cc b/runtime/vm/raw_object_snapshot.cc index 58ec79e90bfa..b871f09dcb40 100644 --- a/runtime/vm/raw_object_snapshot.cc +++ b/runtime/vm/raw_object_snapshot.cc @@ -2149,8 +2149,12 @@ RawRegExp* RegExp::ReadFrom(SnapshotReader* reader, // Read and Set all the other fields. regex.StoreSmi(®ex.raw_ptr()->num_bracket_expressions_, reader->ReadAsSmi()); + + *reader->ArrayHandle() ^= reader->ReadObjectImpl(kAsInlinedObject); + regex.set_capture_name_map(*reader->ArrayHandle()); *reader->StringHandle() ^= reader->ReadObjectImpl(kAsInlinedObject); regex.set_pattern(*reader->StringHandle()); + regex.StoreNonPointer(®ex.raw_ptr()->num_registers_, reader->Read()); regex.StoreNonPointer(®ex.raw_ptr()->type_flags_, reader->Read()); diff --git a/runtime/vm/regexp.h b/runtime/vm/regexp.h index 3a477e951b64..092a06ad4bb7 100644 --- a/runtime/vm/regexp.h +++ b/runtime/vm/regexp.h @@ -1318,12 +1318,14 @@ struct RegExpCompileData : public ZoneAllocated { node(NULL), simple(true), contains_anchor(false), + capture_name_map(Array::Handle(Array::null())), error(String::Handle(String::null())), capture_count(0) {} RegExpTree* tree; RegExpNode* node; bool simple; bool contains_anchor; + Array& capture_name_map; String& error; intptr_t capture_count; }; diff --git a/runtime/vm/regexp_assembler_bytecode.cc b/runtime/vm/regexp_assembler_bytecode.cc index fac8a82a9cae..8288e1b15dfd 100644 --- a/runtime/vm/regexp_assembler_bytecode.cc +++ b/runtime/vm/regexp_assembler_bytecode.cc @@ -441,6 +441,7 @@ static intptr_t Prepare(const RegExp& regexp, RegExpParser::ParseRegExp(pattern, multiline, compile_data); regexp.set_num_bracket_expressions(compile_data->capture_count); + regexp.set_capture_name_map(compile_data->capture_name_map); if (compile_data->simple) { regexp.set_is_simple(); } else { diff --git a/runtime/vm/regexp_ast.h b/runtime/vm/regexp_ast.h index 55f2f8c64f5d..17a175e3d2dc 100644 --- a/runtime/vm/regexp_ast.h +++ b/runtime/vm/regexp_ast.h @@ -277,7 +277,8 @@ class RegExpQuantifier : public RegExpTree { class RegExpCapture : public RegExpTree { public: - explicit RegExpCapture(intptr_t index) : body_(nullptr), index_(index) {} + explicit RegExpCapture(intptr_t index) + : body_(nullptr), index_(index), name_(nullptr) {} virtual void* Accept(RegExpVisitor* visitor, void* data); virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); static RegExpNode* ToNode(RegExpTree* body, @@ -298,12 +299,15 @@ class RegExpCapture : public RegExpTree { // capture group is parsed. void set_body(RegExpTree* body) { body_ = body; } intptr_t index() const { return index_; } + const ZoneGrowableArray* name() { return name_; } + void set_name(const ZoneGrowableArray* name) { name_ = name; } static intptr_t StartRegister(intptr_t index) { return index * 2; } static intptr_t EndRegister(intptr_t index) { return index * 2 + 1; } private: RegExpTree* body_; intptr_t index_; + const ZoneGrowableArray* name_; }; class RegExpLookaround : public RegExpTree { @@ -366,7 +370,9 @@ class RegExpLookaround : public RegExpTree { class RegExpBackReference : public RegExpTree { public: - explicit RegExpBackReference(RegExpCapture* capture) : capture_(capture) {} + RegExpBackReference() : capture_(nullptr), name_(nullptr) {} + explicit RegExpBackReference(RegExpCapture* capture) + : capture_(capture), name_(nullptr) {} virtual void* Accept(RegExpVisitor* visitor, void* data); virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); virtual RegExpBackReference* AsBackReference(); @@ -378,9 +384,13 @@ class RegExpBackReference : public RegExpTree { virtual intptr_t max_match() const { return kInfinity; } intptr_t index() const { return capture_->index(); } RegExpCapture* capture() const { return capture_; } + void set_capture(RegExpCapture* capture) { capture_ = capture; } + const ZoneGrowableArray* name() { return name_; } + void set_name(const ZoneGrowableArray* name) { name_ = name; } private: RegExpCapture* capture_; + const ZoneGrowableArray* name_; }; class RegExpEmpty : public RegExpTree { diff --git a/runtime/vm/regexp_parser.cc b/runtime/vm/regexp_parser.cc index e08071a74e9f..b70b945ad0dd 100644 --- a/runtime/vm/regexp_parser.cc +++ b/runtime/vm/regexp_parser.cc @@ -196,7 +196,9 @@ bool RegExpBuilder::AddQuantifierToAtom( RegExpParser::RegExpParser(const String& in, String* error, bool multiline) : zone_(Thread::Current()->zone()), - captures_(NULL), + captures_(nullptr), + named_captures_(nullptr), + named_back_references_(nullptr), in_(in), current_(kEndMarker), next_pos_(0), @@ -206,7 +208,8 @@ RegExpParser::RegExpParser(const String& in, String* error, bool multiline) multiline_(multiline), simple_(false), contains_anchor_(false), - is_scanned_for_captures_(false) { + is_scanned_for_captures_(false), + has_named_captures_(false) { Advance(); } @@ -261,6 +264,7 @@ void RegExpParser::ReportError(const char* message) { // Disjunction RegExpTree* RegExpParser::ParsePattern() { RegExpTree* result = ParseDisjunction(); + PatchNamedBackReferences(); ASSERT(!has_more()); // If the result of parsing is a literal string atom, and it has the // same length as the input, then the atom is identical to the input. @@ -282,8 +286,8 @@ RegExpTree* RegExpParser::ParsePattern() { // Atom Quantifier RegExpTree* RegExpParser::ParseDisjunction() { // Used to store current state while parsing subexpressions. - RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, - Z); + RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, + 0, nullptr, Z); RegExpParserState* stored_state = &initial_state; // Cache the builder in a local variable for quick access. RegExpBuilder* builder = initial_state.builder(); @@ -317,6 +321,10 @@ RegExpTree* RegExpParser::ParseDisjunction() { // Build result of subexpression. if (group_type == CAPTURE) { + if (stored_state->IsNamedCapture()) { + CreateNamedCaptureAtIndex(stored_state->capture_name(), + capture_index); + } RegExpCapture* capture = GetCapture(capture_index); capture->set_body(body); body = capture; @@ -525,6 +533,18 @@ RegExpTree* RegExpParser::ParseDisjunction() { } break; } + case 'k': + // Either an identity escape or a named back-reference. The two + // interpretations are mutually exclusive: '\k' is interpreted as + // an identity escape for non-Unicode patterns without named + // capture groups, and as the beginning of a named back-reference + // in all other cases. + if (HasNamedCaptures()) { + Advance(2); + ParseNamedBackReference(builder, stored_state); + break; + } + FALL_THROUGH; default: // Identity escape. builder->AddCharacter(Next()); @@ -618,6 +638,8 @@ static bool IsSpecialClassEscape(uint32_t c) { RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( RegExpParserState* state) { RegExpLookaround::Type lookaround_type = state->lookaround_type(); + bool is_named_capture = false; + const RegExpCaptureName* capture_name = nullptr; SubexpressionType subexpr_type = CAPTURE; Advance(); if (current() == '?') { @@ -642,11 +664,16 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( Advance(2); lookaround_type = RegExpLookaround::LOOKBEHIND; subexpr_type = POSITIVE_LOOKAROUND; + break; } else if (Next() == '!') { Advance(2); lookaround_type = RegExpLookaround::LOOKBEHIND; subexpr_type = NEGATIVE_LOOKAROUND; + break; } + is_named_capture = true; + has_named_captures_ = true; + Advance(); break; default: ReportError("Invalid group"); @@ -660,10 +687,14 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( UNREACHABLE(); } captures_started_++; + + if (is_named_capture) { + capture_name = ParseCaptureGroupName(); + } } // Store current state and begin new disjunction parsing. return new RegExpParserState(state, subexpr_type, lookaround_type, - captures_started_, Z); + captures_started_, capture_name, Z); } // In order to know whether an escape is a backreference or not we have to scan @@ -698,7 +729,25 @@ void RegExpParser::ScanForCaptures() { break; } case '(': - if (current() != '?') capture_count++; + // At this point we could be in + // * a non-capturing group '(:', + // * a lookbehind assertion '(?<=' '(?= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_' || + ch == '$'; +} + +inline constexpr bool IsIdentifierPart(uint16_t ch) { + return IsIdentifierStart(ch) || (ch >= '0' && ch <= '9'); +} + +bool IsSameName(const RegExpCaptureName* name1, + const RegExpCaptureName* name2) { + if (name1->length() != name2->length()) return false; + for (intptr_t i = 0; i < name1->length(); i++) { + if (name1->At(i) != name2->At(i)) return false; + } + return true; +} + +} // end namespace + +const RegExpCaptureName* RegExpParser::ParseCaptureGroupName() { + auto name = new (Z) RegExpCaptureName(); + + bool at_start = true; + while (true) { + const uint16_t c = current(); + Advance(); + + // The backslash char is misclassified as both ID_Start and ID_Continue. + if (c == '\\') { + ReportError("Invalid capture group name"); + UNREACHABLE(); + } + + if (at_start) { + if (!IsIdentifierStart(c)) { + ReportError("Invalid capture group name"); + UNREACHABLE(); + } + name->Add(c); + at_start = false; + } else { + if (c == '>') { + break; + } else if (IsIdentifierPart(c)) { + name->Add(c); + } else { + ReportError("Invalid capture group name"); + UNREACHABLE(); + } + } + } + + return name; +} + +intptr_t RegExpParser::GetNamedCaptureIndex(const RegExpCaptureName* name) { + for (const auto& capture : *named_captures_) { + if (IsSameName(name, capture->name())) return capture->index(); + } + return -1; +} + +void RegExpParser::CreateNamedCaptureAtIndex(const RegExpCaptureName* name, + intptr_t index) { + ASSERT(0 < index && index <= captures_started_); + ASSERT(name != nullptr); + + if (named_captures_ == nullptr) { + named_captures_ = new (Z) ZoneGrowableArray(1); + } else { + // Check for duplicates and bail if we find any. Currently O(n^2). + if (GetNamedCaptureIndex(name) >= 0) { + ReportError("Duplicate capture group name"); + UNREACHABLE(); + } + } + + RegExpCapture* capture = GetCapture(index); + ASSERT(capture->name() == nullptr); + + capture->set_name(name); + named_captures_->Add(capture); +} + +bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, + RegExpParserState* state) { + // The parser is assumed to be on the '<' in \k. + if (current() != '<') { + ReportError("Invalid named reference"); + UNREACHABLE(); + } + + Advance(); + const RegExpCaptureName* name = ParseCaptureGroupName(); + if (name == nullptr) { + return false; + } + + if (state->IsInsideCaptureGroup(name)) { + builder->AddEmpty(); + } else { + RegExpBackReference* atom = new (Z) RegExpBackReference(); + atom->set_name(name); + + builder->AddAtom(atom); + + if (named_back_references_ == nullptr) { + named_back_references_ = + new (Z) ZoneGrowableArray(1); + } + named_back_references_->Add(atom); + } + + return true; +} + +void RegExpParser::PatchNamedBackReferences() { + if (named_back_references_ == nullptr) return; + + if (named_captures_ == nullptr) { + ReportError("Invalid named capture referenced"); + return; + } + + // Look up and patch the actual capture for each named back reference. + // Currently O(n^2), optimize if necessary. + for (intptr_t i = 0; i < named_back_references_->length(); i++) { + RegExpBackReference* ref = named_back_references_->At(i); + intptr_t index = GetNamedCaptureIndex(ref->name()); + + if (index < 0) { + ReportError("Invalid named capture referenced"); + UNREACHABLE(); + } + ref->set_capture(GetCapture(index)); + } +} + RegExpCapture* RegExpParser::GetCapture(intptr_t index) { // The index for the capture groups are one-based. Its index in the list is // zero-based. @@ -758,6 +948,39 @@ RegExpCapture* RegExpParser::GetCapture(intptr_t index) { return captures_->At(index - 1); } +RawArray* RegExpParser::CreateCaptureNameMap() { + if (named_captures_ == nullptr || named_captures_->is_empty()) { + return Array::null(); + } + + const intptr_t len = named_captures_->length() * 2; + + const Array& array = Array::Handle(Array::New(len)); + + auto& name = String::Handle(); + auto& smi = Smi::Handle(); + for (intptr_t i = 0; i < named_captures_->length(); i++) { + RegExpCapture* capture = named_captures_->At(i); + name = + String::FromUTF16(capture->name()->data(), capture->name()->length()); + smi = Smi::New(capture->index()); + array.SetAt(i * 2, name); + array.SetAt(i * 2 + 1, smi); + } + + return array.raw(); +} + +bool RegExpParser::HasNamedCaptures() { + if (has_named_captures_ || is_scanned_for_captures_) { + return has_named_captures_; + } + + ScanForCaptures(); + ASSERT(is_scanned_for_captures_); + return has_named_captures_; +} + bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(intptr_t index) { for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) { if (s->group_type() != CAPTURE) continue; @@ -769,6 +992,16 @@ bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(intptr_t index) { return false; } +bool RegExpParser::RegExpParserState::IsInsideCaptureGroup( + const RegExpCaptureName* name) { + ASSERT(name != nullptr); + for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) { + if (s->capture_name() == nullptr) continue; + if (IsSameName(s->capture_name(), name)) return true; + } + return false; +} + // QuantifierPrefix :: // { DecimalDigits } // { DecimalDigits , } @@ -1092,6 +1325,7 @@ void RegExpParser::ParseRegExp(const String& input, intptr_t capture_count = parser.captures_started(); result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; result->contains_anchor = parser.contains_anchor(); + result->capture_name_map = parser.CreateCaptureNameMap(); result->capture_count = capture_count; } diff --git a/runtime/vm/regexp_parser.h b/runtime/vm/regexp_parser.h index c4e127b5823c..a2b626b97a6c 100644 --- a/runtime/vm/regexp_parser.h +++ b/runtime/vm/regexp_parser.h @@ -52,6 +52,8 @@ class RegExpBuilder : public ZoneAllocated { #endif }; +using RegExpCaptureName = ZoneGrowableArray; + class RegExpParser : public ValueObject { public: RegExpParser(const String& in, String* error, bool multiline_mode); @@ -117,12 +119,14 @@ class RegExpParser : public ValueObject { SubexpressionType group_type, RegExpLookaround::Type lookaround_type, intptr_t disjunction_capture_index, + const RegExpCaptureName* capture_name, Zone* zone) : previous_state_(previous_state), builder_(new (zone) RegExpBuilder()), group_type_(group_type), lookaround_type_(lookaround_type), - disjunction_capture_index_(disjunction_capture_index) {} + disjunction_capture_index_(disjunction_capture_index), + capture_name_(capture_name) {} // Parser state of containing expression, if any. RegExpParserState* previous_state() { return previous_state_; } bool IsSubexpression() { return previous_state_ != NULL; } @@ -136,9 +140,14 @@ class RegExpParser : public ValueObject { // Also the capture index of this sub-expression itself, if group_type // is CAPTURE. intptr_t capture_index() { return disjunction_capture_index_; } + const RegExpCaptureName* capture_name() const { return capture_name_; } + + bool IsNamedCapture() const { return capture_name_ != nullptr; } // Check whether the parser is inside a capture group with the given index. bool IsInsideCaptureGroup(intptr_t index); + // Check whether the parser is inside a capture group with the given name. + bool IsInsideCaptureGroup(const RegExpCaptureName* name); private: // Linked list implementation of stack of states. @@ -151,12 +160,37 @@ class RegExpParser : public ValueObject { const RegExpLookaround::Type lookaround_type_; // Stored disjunction's capture index (if any). intptr_t disjunction_capture_index_; + // Stored capture name (if any). + const RegExpCaptureName* const capture_name_; }; // Return the 1-indexed RegExpCapture object, allocate if necessary. RegExpCapture* GetCapture(intptr_t index); + // Creates a new named capture at the specified index. Must be called exactly + // once for each named capture. Fails if a capture with the same name is + // encountered. + void CreateNamedCaptureAtIndex(const RegExpCaptureName* name, intptr_t index); + + // Parses the name of a capture group (?pattern). The name must adhere + // to IdentifierName in the ECMAScript standard. + const RegExpCaptureName* ParseCaptureGroupName(); + + bool ParseNamedBackReference(RegExpBuilder* builder, + RegExpParserState* state); RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); + intptr_t GetNamedCaptureIndex(const RegExpCaptureName* name); + + // After the initial parsing pass, patch corresponding RegExpCapture objects + // into all RegExpBackReferences. This is done after initial parsing in order + // to avoid complicating cases in which references come before the capture. + void PatchNamedBackReferences(); + + RawArray* CreateCaptureNameMap(); + + // Returns true iff the pattern contains named captures. May call + // ScanForCaptures to look ahead at the remaining pattern. + bool HasNamedCaptures(); Zone* zone() { return zone_; } @@ -169,6 +203,8 @@ class RegExpParser : public ValueObject { Zone* zone_; ZoneGrowableArray* captures_; + ZoneGrowableArray* named_captures_; + ZoneGrowableArray* named_back_references_; const String& in_; uint32_t current_; intptr_t next_pos_; @@ -180,6 +216,7 @@ class RegExpParser : public ValueObject { bool simple_; bool contains_anchor_; bool is_scanned_for_captures_; + bool has_named_captures_; }; } // namespace dart diff --git a/sdk/lib/_internal/js_runtime/lib/js_helper.dart b/sdk/lib/_internal/js_runtime/lib/js_helper.dart index d60398c37c3c..9ad339af0336 100644 --- a/sdk/lib/_internal/js_runtime/lib/js_helper.dart +++ b/sdk/lib/_internal/js_runtime/lib/js_helper.dart @@ -45,7 +45,11 @@ import 'dart:_foreign_helper' import 'dart:_interceptors'; import 'dart:_internal' as _symbol_dev; import 'dart:_internal' - show EfficientLengthIterable, MappedIterable, IterableElementError; + show + EfficientLengthIterable, + MappedIterable, + IterableElementError, + SubListIterable; import 'dart:_native_typed_data'; diff --git a/sdk/lib/_internal/js_runtime/lib/regexp_helper.dart b/sdk/lib/_internal/js_runtime/lib/regexp_helper.dart index 77f746b4e980..51b4bfbe143a 100644 --- a/sdk/lib/_internal/js_runtime/lib/regexp_helper.dart +++ b/sdk/lib/_internal/js_runtime/lib/regexp_helper.dart @@ -155,10 +155,13 @@ class JSSyntaxRegExp implements RegExp { bool get isCaseSensitive => _isCaseSensitive; } -class _MatchImplementation implements Match { +class _MatchImplementation implements RegExpMatch { final Pattern pattern; // Contains a JS RegExp match object. // It is an Array of String values with extra 'index' and 'input' properties. + // If there were named capture groups, there will also be an extra 'groups' + // property containing an object with capture group names as keys and + // matched strings as values. // We didn't force it to be JSArray, so it is JSArray, but // containing String or `undefined` values. final JSArray _match; @@ -193,6 +196,27 @@ class _MatchImplementation implements Match { } return out; } + + String namedGroup(String name) { + var groups = JS('Object', '#.groups', _match); + if (groups != null) { + var result = JS('String|Null', '#[#]', groups, name); + if (result != null || JS('bool', '# in #', name, groups)) { + return result; + } + } + throw ArgumentError.value(name, "name", "Not a capture group name"); + } + + Iterable get groupNames { + var groups = JS('Object', '#.groups', _match); + if (groups != null) { + var keys = new JSArray.markGrowable( + JS('returns:JSExtendableArray;new:true', 'Object.keys(#)', groups)); + return SubListIterable(keys, 0, null); + } + return Iterable.empty(); + } } class _AllMatchesIterable extends IterableBase { diff --git a/sdk/lib/core/regexp.dart b/sdk/lib/core/regexp.dart index 9164fb90c439..757bb81d0cae 100644 --- a/sdk/lib/core/regexp.dart +++ b/sdk/lib/core/regexp.dart @@ -121,3 +121,30 @@ abstract class RegExp implements Pattern { */ bool get isCaseSensitive; } + +/** + * A regular expression match. + * + * Regular expression matches are [Match]es, but also include the ability + * to retrieve the names for any named capture groups and to retrieve + * matches for named capture groups by name instead of their index. + */ +abstract class RegExpMatch implements Match { + /** + * The string matched by the group named [name]. + * + * Returns the string matched by the capture group named [name], or + * `null` if no string was matched by that capture group as part of + * this match. + * + * The [name] must be the name of a named capture group in the regular + * expression creating this match (that is, the name must be in + * [groupNames]). + */ + String namedGroup(String name); + + /** + * The names of the captured groups in the match. + */ + Iterable get groupNames; +} diff --git a/tests/corelib_2/regexp/named-captures_test.dart b/tests/corelib_2/regexp/named-captures_test.dart new file mode 100644 index 000000000000..aff61b82d8bf --- /dev/null +++ b/tests/corelib_2/regexp/named-captures_test.dart @@ -0,0 +1,107 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2017 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; + +void main() { + void testRE(RegExp re, String input, bool expectedResult) { + if (expectedResult) { + assertTrue(re.hasMatch(input)); + } else { + assertFalse(re.hasMatch(input)); + } + } + + void execRE(RegExp re, String input, List expectedResult) { + assertTrue(re.hasMatch(input)); + shouldBe(re.firstMatch(input), expectedResult); + } + + void namedRE(RegExp re, String input, Map expectedResults) { + assertTrue(re.hasMatch(input)); + var match = re.firstMatch(input) as RegExpMatch; + for (var s in expectedResults.keys) { + assertEquals(match.namedGroup(s), expectedResults[s]); + } + } + + void hasNames(RegExp re, String input, List expectedResults) { + assertTrue(re.hasMatch(input)); + var match = re.firstMatch(input) as RegExpMatch; + for (var s in match.groupNames) { + assertTrue(expectedResults.contains(s)); + } + } + + // Behavior in non-unicode mode. + assertThrows(() => RegExp(r"(?<>a)")); + assertThrows(() => RegExp(r"(? RegExp(r"(?<42a>a)")); + assertThrows(() => RegExp(r"(?<:a>a)")); + assertThrows(() => RegExp(r"(?a)")); + assertThrows(() => RegExp(r"(?a)(?a)")); + assertThrows(() => RegExp(r"(?a)(?b)(?a)")); + assertTrue(RegExp(r"\k").hasMatch("k")); + assertTrue(RegExp(r"\k<4>").hasMatch("k<4>")); + assertTrue(RegExp(r"\k RegExp(r"(?.)\k")); + assertThrows(() => RegExp(r"(?.)\k RegExp(r"(?.)\k")); + assertThrows(() => RegExp(r"(?a)\k")); + assertThrows(() => RegExp(r"(?a)\k")); + assertThrows(() => RegExp(r"\k(?a)")); + assertThrows(() => RegExp(r"\ka)")); + assertTrue(RegExp(r"(?\a)").hasMatch("a")); + + var re = RegExp(r"\k"); + execRE(re, "xxxkxxx", ["k"]); + + re = RegExp(r"\kxxx", ["k.)(?.)(?.)\k\k\k"); + execRE(re, "abccba", ["abccba", "a", "b", "c"]); + namedRE(re, "abccba", {"a": "a", "b": "b", "c": "c"}); + hasNames(re, "abccba", ["a", "b", "c"]); + + // A couple of corner cases around '\k' as named back-references vs. identity + // escapes. + assertTrue(RegExp(r"\k(?<=>)a").hasMatch("ka")); + assertTrue(RegExp(r"\k(?a")); + assertTrue(RegExp(r"\k(x)").hasMatch("kx")); + assertTrue(RegExp(r"\k(?x)").hasMatch("x")); + assertThrows(() => RegExp(r"\k(?x)")); + assertThrows(() => RegExp(r"\k.)")); + assertThrows(() => RegExp(r"\k(?.)")); + + // TODO(sstrickl): Add more tests when unicode flag support is in. + // https://github.com/dart-lang/sdk/issues/36170 +}