From 5b920281558aa24a4cd422a7ecf9d651355d9936 Mon Sep 17 00:00:00 2001 From: Ingo Oeser Date: Sat, 9 May 2015 17:55:05 +0200 Subject: [PATCH] html: speed up UnescapeString Add benchmarks for for sparsely escaped and densely escaped strings. Then speed up the sparse unescaping part heavily by using IndexByte and copy to skip the parts containing no escaping very fast. Unescaping densely escaped strings slower because of the new function call overhead. But sparsely encoded strings are seen more often in the utf8 enabled web. We win part of the speed back by looking up entityName differently. benchmark old ns/op new ns/op delta BenchmarkEscape 31680 31396 -0.90% BenchmarkEscapeNone 6507 6872 +5.61% BenchmarkUnescape 36481 48298 +32.39% BenchmarkUnescapeNone 332 325 -2.11% BenchmarkUnescapeSparse 8836 3221 -63.55% BenchmarkUnescapeDense 30639 32224 +5.17% Change-Id: If606cb01897a40eefe35ba98f2ff23bb25251606 Reviewed-on: https://go-review.googlesource.com/10172 Reviewed-by: Brad Fitzpatrick Run-TryBot: Brad Fitzpatrick TryBot-Result: Gobot Gobot --- src/html/escape.go | 60 +++++++++++++++++++++-------------------- src/html/escape_test.go | 20 ++++++++++++-- 2 files changed, 49 insertions(+), 31 deletions(-) diff --git a/src/html/escape.go b/src/html/escape.go index f50a4b937a79b..ab6fd1c7b4b34 100644 --- a/src/html/escape.go +++ b/src/html/escape.go @@ -57,8 +57,9 @@ var replacementTable = [...]rune{ // unescapeEntity reads an entity like "<" from b[src:] and writes the // corresponding "<" to b[dst:], returning the incremented dst and src cursors. // Precondition: b[src] == '&' && dst <= src. -// attribute should be true if parsing an attribute value. -func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { +func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { + const attribute = false + // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference // i starts at 1 because we already know that s[0] == '&'. @@ -139,14 +140,14 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { break } - entityName := string(s[1:i]) - if entityName == "" { + entityName := s[1:i] + if len(entityName) == 0 { // No-op. } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' { // No-op. - } else if x := entity[entityName]; x != 0 { + } else if x := entity[string(entityName)]; x != 0 { return dst + utf8.EncodeRune(b[dst:], x), src + i - } else if x := entity2[entityName]; x[0] != 0 { + } else if x := entity2[string(entityName)]; x[0] != 0 { dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i } else if !attribute { @@ -155,7 +156,7 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { maxLen = longestEntityWithoutSemicolon } for j := maxLen; j > 1; j-- { - if x := entity[entityName[:j]]; x != 0 { + if x := entity[string(entityName[:j])]; x != 0 { return dst + utf8.EncodeRune(b[dst:], x), src + j + 1 } } @@ -166,26 +167,6 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { return dst1, src1 } -// unescape unescapes b's entities in-place, so that "a<b" becomes "a 0 { + if s[src] == '&' { + i = 0 + } else { + i = strings.IndexByte(s[src:], '&') + } + if i < 0 { + dst += copy(b[dst:], s[src:]) + break + } + + if i > 0 { + copy(b[dst:], s[src:src+i]) + } + dst, src = unescapeEntity(b, dst+i, src+i) + } + return string(b[:dst]) } diff --git a/src/html/escape_test.go b/src/html/escape_test.go index 3702626a3dccf..8b51a55409fa5 100644 --- a/src/html/escape_test.go +++ b/src/html/escape_test.go @@ -118,8 +118,10 @@ func TestUnescapeEscape(t *testing.T) { } var ( - benchEscapeData = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100) - benchEscapeNone = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100) + benchEscapeData = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100) + benchEscapeNone = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100) + benchUnescapeSparse = strings.Repeat(strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 10)+"&", 10) + benchUnescapeDense = strings.Repeat("&< & <", 100) ) func BenchmarkEscape(b *testing.B) { @@ -151,3 +153,17 @@ func BenchmarkUnescapeNone(b *testing.B) { n += len(UnescapeString(s)) } } + +func BenchmarkUnescapeSparse(b *testing.B) { + n := 0 + for i := 0; i < b.N; i++ { + n += len(UnescapeString(benchUnescapeSparse)) + } +} + +func BenchmarkUnescapeDense(b *testing.B) { + n := 0 + for i := 0; i < b.N; i++ { + n += len(UnescapeString(benchUnescapeDense)) + } +}