diff --git a/src/pkg/regexp/all_test.go b/src/pkg/regexp/all_test.go
index 107dfe37cc7fe..f7b41a67416ba 100644
--- a/src/pkg/regexp/all_test.go
+++ b/src/pkg/regexp/all_test.go
@@ -176,6 +176,45 @@ var replaceTests = []ReplaceTest{
 	{"[a-c]*", "x", "def", "xdxexfx"},
 	{"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"},
 	{"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"},
+
+	// Substitutions
+	{"a+", "($0)", "banana", "b(a)n(a)n(a)"},
+	{"a+", "(${0})", "banana", "b(a)n(a)n(a)"},
+	{"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"},
+	{"a+", "(${0})$0", "banana", "b(a)an(a)an(a)a"},
+	{"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, world"},
+	{"hello, (.+)", "goodbye, $1x", "hello, world", "goodbye, "},
+	{"hello, (.+)", "goodbye, ${1}x", "hello, world", "goodbye, worldx"},
+	{"hello, (.+)", "<$0><$1><$2><$3>", "hello, world", "<hello, world><world><><>"},
+	{"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, world!"},
+	{"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, world"},
+	{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "hihihi"},
+	{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "byebyebye"},
+	{"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", ""},
+	{"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "hiyz"},
+	{"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $x"},
+	{"a+", "${oops", "aaa", "${oops"},
+	{"a+", "$$", "aaa", "$"},
+	{"a+", "$", "aaa", "$"},
+}
+
+var replaceLiteralTests = []ReplaceTest{
+	// Substitutions
+	{"a+", "($0)", "banana", "b($0)n($0)n($0)"},
+	{"a+", "(${0})", "banana", "b(${0})n(${0})n(${0})"},
+	{"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"},
+	{"a+", "(${0})$0", "banana", "b(${0})$0n(${0})$0n(${0})$0"},
+	{"hello, (.+)", "goodbye, ${1}", "hello, world", "goodbye, ${1}"},
+	{"hello, (?P<noun>.+)", "goodbye, $noun!", "hello, world", "goodbye, $noun!"},
+	{"hello, (?P<noun>.+)", "goodbye, ${noun}", "hello, world", "goodbye, ${noun}"},
+	{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "hi", "$x$x$x"},
+	{"(?P<x>hi)|(?P<x>bye)", "$x$x$x", "bye", "$x$x$x"},
+	{"(?P<x>hi)|(?P<x>bye)", "$xyz", "hi", "$xyz"},
+	{"(?P<x>hi)|(?P<x>bye)", "${x}yz", "hi", "${x}yz"},
+	{"(?P<x>hi)|(?P<x>bye)", "hello $$x", "hi", "hello $$x"},
+	{"a+", "${oops", "aaa", "${oops"},
+	{"a+", "$$", "aaa", "$$"},
+	{"a+", "$", "aaa", "$"},
 }
 
 type ReplaceFuncTest struct {
@@ -199,13 +238,58 @@ func TestReplaceAll(t *testing.T) {
 		}
 		actual := re.ReplaceAllString(tc.input, tc.replacement)
 		if actual != tc.output {
-			t.Errorf("%q.Replace(%q,%q) = %q; want %q",
+			t.Errorf("%q.ReplaceAllString(%q,%q) = %q; want %q",
 				tc.pattern, tc.input, tc.replacement, actual, tc.output)
 		}
 		// now try bytes
 		actual = string(re.ReplaceAll([]byte(tc.input), []byte(tc.replacement)))
 		if actual != tc.output {
-			t.Errorf("%q.Replace(%q,%q) = %q; want %q",
+			t.Errorf("%q.ReplaceAll(%q,%q) = %q; want %q",
+				tc.pattern, tc.input, tc.replacement, actual, tc.output)
+		}
+	}
+}
+
+func TestReplaceAllLiteral(t *testing.T) {
+	// Run ReplaceAll tests that do not have $ expansions.
+	for _, tc := range replaceTests {
+		if strings.Contains(tc.replacement, "$") {
+			continue
+		}
+		re, err := Compile(tc.pattern)
+		if err != nil {
+			t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
+			continue
+		}
+		actual := re.ReplaceAllLiteralString(tc.input, tc.replacement)
+		if actual != tc.output {
+			t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q",
+				tc.pattern, tc.input, tc.replacement, actual, tc.output)
+		}
+		// now try bytes
+		actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement)))
+		if actual != tc.output {
+			t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q",
+				tc.pattern, tc.input, tc.replacement, actual, tc.output)
+		}
+	}
+
+	// Run literal-specific tests.
+	for _, tc := range replaceLiteralTests {
+		re, err := Compile(tc.pattern)
+		if err != nil {
+			t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
+			continue
+		}
+		actual := re.ReplaceAllLiteralString(tc.input, tc.replacement)
+		if actual != tc.output {
+			t.Errorf("%q.ReplaceAllLiteralString(%q,%q) = %q; want %q",
+				tc.pattern, tc.input, tc.replacement, actual, tc.output)
+		}
+		// now try bytes
+		actual = string(re.ReplaceAllLiteral([]byte(tc.input), []byte(tc.replacement)))
+		if actual != tc.output {
+			t.Errorf("%q.ReplaceAllLiteral(%q,%q) = %q; want %q",
 				tc.pattern, tc.input, tc.replacement, actual, tc.output)
 		}
 	}
diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go
index 7aebd3728a3dc..28c903e7b30e6 100644
--- a/src/pkg/regexp/regexp.go
+++ b/src/pkg/regexp/regexp.go
@@ -61,6 +61,7 @@ import (
 	"strconv"
 	"strings"
 	"sync"
+	"unicode"
 	"unicode/utf8"
 )
 
@@ -416,41 +417,79 @@ func Match(pattern string, b []byte) (matched bool, error error) {
 	return re.Match(b), nil
 }
 
-// ReplaceAllString returns a copy of src in which all matches for the Regexp
-// have been replaced by repl.  No support is provided for expressions
-// (e.g. \1 or $1) in the replacement string.
+// ReplaceAllString returns a copy of src, replacing matches of the Regexp
+// with the replacement string repl.  Inside repl, $ signs are interpreted as
+// in Expand, so for instance $1 represents the text of the first submatch.
 func (re *Regexp) ReplaceAllString(src, repl string) string {
-	return re.ReplaceAllStringFunc(src, func(string) string { return repl })
+	n := 2
+	if strings.Index(repl, "$") >= 0 {
+		n = 2 * (re.numSubexp + 1)
+	}
+	b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
+		return re.expand(dst, repl, nil, src, match)
+	})
+	return string(b)
 }
 
-// ReplaceAllStringFunc returns a copy of src in which all matches for the
-// Regexp have been replaced by the return value of of function repl (whose
-// first argument is the matched string).  No support is provided for
-// expressions (e.g. \1 or $1) in the replacement string.
+// ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp
+// with the replacement string repl.  The replacement repl is substituted directly,
+// without using Expand.
+func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
+	return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
+		return append(dst, repl...)
+	}))
+}
+
+// ReplaceAllStringFunc returns a copy of src in which all matches of the
+// Regexp have been replaced by the return value of function repl applied
+// to the matched substring.  The replacement returned by repl is substituted
+// directly, without using Expand.
 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
+	b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
+		return append(dst, repl(src[match[0]:match[1]])...)
+	})
+	return string(b)
+}
+
+func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
 	lastMatchEnd := 0 // end position of the most recent match
 	searchPos := 0    // position where we next look for a match
-	buf := new(bytes.Buffer)
-	for searchPos <= len(src) {
-		a := re.doExecute(nil, nil, src, searchPos, 2)
+	var buf []byte
+	var endPos int
+	if bsrc != nil {
+		endPos = len(bsrc)
+	} else {
+		endPos = len(src)
+	}
+	for searchPos <= endPos {
+		a := re.doExecute(nil, bsrc, src, searchPos, nmatch)
 		if len(a) == 0 {
 			break // no more matches
 		}
 
 		// Copy the unmatched characters before this match.
-		io.WriteString(buf, src[lastMatchEnd:a[0]])
+		if bsrc != nil {
+			buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
+		} else {
+			buf = append(buf, src[lastMatchEnd:a[0]]...)
+		}
 
 		// Now insert a copy of the replacement string, but not for a
 		// match of the empty string immediately after another match.
 		// (Otherwise, we get double replacement for patterns that
 		// match both empty and nonempty strings.)
 		if a[1] > lastMatchEnd || a[0] == 0 {
-			io.WriteString(buf, repl(src[a[0]:a[1]]))
+			buf = repl(buf, a)
 		}
 		lastMatchEnd = a[1]
 
 		// Advance past this match; always advance at least one character.
-		_, width := utf8.DecodeRuneInString(src[searchPos:])
+		var width int
+		if bsrc != nil {
+			_, width = utf8.DecodeRune(bsrc[searchPos:])
+		} else {
+			_, width = utf8.DecodeRuneInString(src[searchPos:])
+		}
 		if searchPos+width > a[1] {
 			searchPos += width
 		} else if searchPos+1 > a[1] {
@@ -463,61 +502,50 @@ func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) str
 	}
 
 	// Copy the unmatched characters after the last match.
-	io.WriteString(buf, src[lastMatchEnd:])
+	if bsrc != nil {
+		buf = append(buf, bsrc[lastMatchEnd:]...)
+	} else {
+		buf = append(buf, src[lastMatchEnd:]...)
+	}
 
-	return buf.String()
+	return buf
 }
 
-// ReplaceAll returns a copy of src in which all matches for the Regexp
-// have been replaced by repl.  No support is provided for expressions
-// (e.g. \1 or $1) in the replacement text.
+// ReplaceAll returns a copy of src, replacing matches of the Regexp
+// with the replacement string repl.  Inside repl, $ signs are interpreted as
+// in Expand, so for instance $1 represents the text of the first submatch.
 func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
-	return re.ReplaceAllFunc(src, func([]byte) []byte { return repl })
-}
-
-// ReplaceAllFunc returns a copy of src in which all matches for the
-// Regexp have been replaced by the return value of of function repl (whose
-// first argument is the matched []byte).  No support is provided for
-// expressions (e.g. \1 or $1) in the replacement string.
-func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
-	lastMatchEnd := 0 // end position of the most recent match
-	searchPos := 0    // position where we next look for a match
-	buf := new(bytes.Buffer)
-	for searchPos <= len(src) {
-		a := re.doExecute(nil, src, "", searchPos, 2)
-		if len(a) == 0 {
-			break // no more matches
-		}
-
-		// Copy the unmatched characters before this match.
-		buf.Write(src[lastMatchEnd:a[0]])
-
-		// Now insert a copy of the replacement string, but not for a
-		// match of the empty string immediately after another match.
-		// (Otherwise, we get double replacement for patterns that
-		// match both empty and nonempty strings.)
-		if a[1] > lastMatchEnd || a[0] == 0 {
-			buf.Write(repl(src[a[0]:a[1]]))
-		}
-		lastMatchEnd = a[1]
-
-		// Advance past this match; always advance at least one character.
-		_, width := utf8.DecodeRune(src[searchPos:])
-		if searchPos+width > a[1] {
-			searchPos += width
-		} else if searchPos+1 > a[1] {
-			// This clause is only needed at the end of the input
-			// string.  In that case, DecodeRuneInString returns width=0.
-			searchPos++
-		} else {
-			searchPos = a[1]
-		}
+	n := 2
+	if bytes.IndexByte(repl, '$') >= 0 {
+		n = 2 * (re.numSubexp + 1)
 	}
+	srepl := ""
+	b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
+		if len(srepl) != len(repl) {
+			srepl = string(repl)
+		}
+		return re.expand(dst, srepl, src, "", match)
+	})
+	return b
+}
 
-	// Copy the unmatched characters after the last match.
-	buf.Write(src[lastMatchEnd:])
+// ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
+// with the replacement bytes repl.  The replacement repl is substituted directly,
+// without using Expand.
+func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
+	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
+		return append(dst, repl...)
+	})
+}
 
-	return buf.Bytes()
+// ReplaceAllFunc returns a copy of src in which all matches of the
+// Regexp have been replaced by the return value of function repl applied
+// to the matched byte slice.  The replacement returned by repl is substituted
+// directly, without using Expand.
+func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
+	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
+		return append(dst, repl(src[match[0]:match[1]])...)
+	})
 }
 
 var specialBytes = []byte(`\.+*?()|[]{}^$`)
@@ -687,6 +715,134 @@ func (re *Regexp) FindSubmatch(b []byte) [][]byte {
 	return ret
 }
 
+// Expand appends template to dst and returns the result; during the
+// append, Expand replaces variables in the template with corresponding
+// matches drawn from src.  The match slice should have been returned by
+// FindSubmatchIndex.
+//
+// In the template, a variable is denoted by a substring of the form
+// $name or ${name}, where name is a non-empty sequence of letters,
+// digits, and underscores.  A purely numeric name like $1 refers to
+// the submatch with the corresponding index; other names refer to
+// capturing parentheses named with the (?P<name>...) syntax.  A
+// reference to an out of range or unmatched index or a name that is not
+// present in the regular expression is replaced with an empty string.
+//
+// In the $name form, name is taken to be as long as possible: $1x is
+// equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
+//
+// To insert a literal $ in the output, use $$ in the template.
+func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
+	return re.expand(dst, string(template), src, "", match)
+}
+
+// ExpandString is like Expand but the template and source are strings.
+// It appends to and returns a byte slice in order to give the calling
+// code control over allocation.
+func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
+	return re.expand(dst, template, nil, src, match)
+}
+
+func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
+	for len(template) > 0 {
+		i := strings.Index(template, "$")
+		if i < 0 {
+			break
+		}
+		dst = append(dst, template[:i]...)
+		template = template[i:]
+		if len(template) > 1 && template[1] == '$' {
+			// Treat $$ as $.
+			dst = append(dst, '$')
+			template = template[2:]
+			continue
+		}
+		name, num, rest, ok := extract(template)
+		if !ok {
+			// Malformed; treat $ as raw text.
+			dst = append(dst, '$')
+			template = template[1:]
+			continue
+		}
+		template = rest
+		if num >= 0 {
+			if 2*num+1 < len(match) && match[2*num] >= 0 { // unmatched groups have offset -1; expand them to nothing
+				if bsrc != nil {
+					dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
+				} else {
+					dst = append(dst, src[match[2*num]:match[2*num+1]]...)
+				}
+			}
+		} else {
+			for i, namei := range re.subexpNames {
+				if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
+					if bsrc != nil {
+						dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
+					} else {
+						dst = append(dst, src[match[2*i]:match[2*i+1]]...)
+					}
+					break
+				}
+			}
+		}
+	}
+	dst = append(dst, template...)
+	return dst
+}
+
+// extract returns the name from a leading "$name" or "${name}" in str.
+// If it is a number, extract returns num set to that number; otherwise num = -1.
+func extract(str string) (name string, num int, rest string, ok bool) {
+	if len(str) < 2 || str[0] != '$' {
+		return
+	}
+	brace := false
+	if str[1] == '{' {
+		brace = true
+		str = str[2:]
+	} else {
+		str = str[1:]
+	}
+	i := 0
+	for i < len(str) {
+		rune, size := utf8.DecodeRuneInString(str[i:])
+		if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
+			break
+		}
+		i += size
+	}
+	if i == 0 {
+		// empty name is not okay
+		return
+	}
+	name = str[:i]
+	if brace {
+		if i >= len(str) || str[i] != '}' {
+			// missing closing brace
+			return
+		}
+		i++
+	}
+
+	// Parse number.
+	num = 0
+	for i := 0; i < len(name); i++ {
+		if name[i] < '0' || '9' < name[i] || num >= 1e8 {
+			num = -1
+			break
+		}
+		num = num*10 + int(name[i]) - '0'
+	}
+	// Disallow leading zeros.
+	if name[0] == '0' && len(name) > 1 {
+		num = -1
+	}
+
+	rest = str[i:]
+	ok = true
+	return
+}
+
 // FindSubmatchIndex returns a slice holding the index pairs identifying the
 // leftmost match of the regular expression in b and the matches, if any, of
 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions