Skip to content

Commit

Permalink
html: speed up UnescapeString
Browse files Browse the repository at this point in the history
Add benchmarks for for sparsely escaped and densely escaped strings.
Then speed up the sparse unescaping part heavily by using IndexByte and
copy to skip the parts containing no escaping very fast.

Unescaping densely escaped strings slower because of
the new function call overhead. But sparsely encoded strings are seen
more often in the utf8 enabled web.

We win part of the speed back by looking up entityName differently.

	benchmark                  old ns/op    new ns/op    delta
	BenchmarkEscape                31680        31396   -0.90%
	BenchmarkEscapeNone             6507         6872   +5.61%
	BenchmarkUnescape              36481        48298  +32.39%
	BenchmarkUnescapeNone            332          325   -2.11%
	BenchmarkUnescapeSparse         8836         3221  -63.55%
	BenchmarkUnescapeDense         30639        32224   +5.17%

Change-Id: If606cb01897a40eefe35ba98f2ff23bb25251606
Reviewed-on: https://go-review.googlesource.com/10172
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
  • Loading branch information
nightlyone authored and bradfitz committed Aug 22, 2015
1 parent 5f859ba commit 5b92028
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 31 deletions.
60 changes: 31 additions & 29 deletions src/html/escape.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ var replacementTable = [...]rune{
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
// Precondition: b[src] == '&' && dst <= src.
// attribute should be true if parsing an attribute value.
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
const attribute = false

// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference

// i starts at 1 because we already know that s[0] == '&'.
Expand Down Expand Up @@ -139,14 +140,14 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
break
}

entityName := string(s[1:i])
if entityName == "" {
entityName := s[1:i]
if len(entityName) == 0 {
// No-op.
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
// No-op.
} else if x := entity[entityName]; x != 0 {
} else if x := entity[string(entityName)]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + i
} else if x := entity2[entityName]; x[0] != 0 {
} else if x := entity2[string(entityName)]; x[0] != 0 {
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
} else if !attribute {
Expand All @@ -155,7 +156,7 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
maxLen = longestEntityWithoutSemicolon
}
for j := maxLen; j > 1; j-- {
if x := entity[entityName[:j]]; x != 0 {
if x := entity[string(entityName[:j])]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
}
}
Expand All @@ -166,26 +167,6 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
return dst1, src1
}

// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
func unescape(b []byte) []byte {
for i, c := range b {
if c == '&' {
dst, src := unescapeEntity(b, i, i, false)
for src < len(b) {
c := b[src]
if c == '&' {
dst, src = unescapeEntity(b, dst, src, false)
} else {
b[dst] = c
dst, src = dst+1, src+1
}
}
return b[0:dst]
}
}
return b
}

var htmlEscaper = strings.NewReplacer(
`&`, "&amp;",
`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
Expand All @@ -208,8 +189,29 @@ func EscapeString(s string) string {
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true.
func UnescapeString(s string) string {
if !strings.Contains(s, "&") {
i := strings.IndexByte(s, '&')

if i < 0 {
return s
}
return string(unescape([]byte(s)))

b := []byte(s)
dst, src := unescapeEntity(b, i, i)
for len(s[src:]) > 0 {
if s[src] == '&' {
i = 0
} else {
i = strings.IndexByte(s[src:], '&')
}
if i < 0 {
dst += copy(b[dst:], s[src:])
break
}

if i > 0 {
copy(b[dst:], s[src:src+i])
}
dst, src = unescapeEntity(b, dst+i, src+i)
}
return string(b[:dst])
}
20 changes: 18 additions & 2 deletions src/html/escape_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,10 @@ func TestUnescapeEscape(t *testing.T) {
}

var (
benchEscapeData = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100)
benchEscapeNone = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100)
benchEscapeData = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100)
benchEscapeNone = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100)
benchUnescapeSparse = strings.Repeat(strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 10)+"&amp;", 10)
benchUnescapeDense = strings.Repeat("&amp;&lt; &amp; &lt;", 100)
)

func BenchmarkEscape(b *testing.B) {
Expand Down Expand Up @@ -151,3 +153,17 @@ func BenchmarkUnescapeNone(b *testing.B) {
n += len(UnescapeString(s))
}
}

func BenchmarkUnescapeSparse(b *testing.B) {
n := 0
for i := 0; i < b.N; i++ {
n += len(UnescapeString(benchUnescapeSparse))
}
}

func BenchmarkUnescapeDense(b *testing.B) {
n := 0
for i := 0; i < b.N; i++ {
n += len(UnescapeString(benchUnescapeDense))
}
}

0 comments on commit 5b92028

Please sign in to comment.