html: speed up UnescapeString

Add benchmarks for for sparsely escaped and densely escaped strings. Then speed up the sparse unescaping part heavily by using IndexByte and copy to skip the parts containing no escaping very fast. Unescaping densely escaped strings slower because of the new function call overhead. But sparsely encoded strings are seen more often in the utf8 enabled web. We win part of the speed back by looking up entityName differently. benchmark old ns/op new ns/op delta BenchmarkEscape 31680 31396 -0.90% BenchmarkEscapeNone 6507 6872 +5.61% BenchmarkUnescape 36481 48298 +32.39% BenchmarkUnescapeNone 332 325 -2.11% BenchmarkUnescapeSparse 8836 3221 -63.55% BenchmarkUnescapeDense 30639 32224 +5.17% Change-Id: If606cb01897a40eefe35ba98f2ff23bb25251606 Reviewed-on: https://go-review.googlesource.com/10172 Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
golang · Aug 22, 2015 · 5b92028 · 5b92028
1 parent 5f859ba
commit 5b92028
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 31 deletions.
diff --git a/src/html/escape.go b/src/html/escape.go
@@ -57,8 +57,9 @@ var replacementTable = [...]rune{
 // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
 // Precondition: b[src] == '&' && dst <= src.
-// attribute should be true if parsing an attribute value.
-func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
+func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
+	const attribute = false
+
 	// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
 
 	// i starts at 1 because we already know that s[0] == '&'.
@@ -139,14 +140,14 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
 		break
 	}
 
-	entityName := string(s[1:i])
-	if entityName == "" {
+	entityName := s[1:i]
+	if len(entityName) == 0 {
 		// No-op.
 	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
 		// No-op.
-	} else if x := entity[entityName]; x != 0 {
+	} else if x := entity[string(entityName)]; x != 0 {
 		return dst + utf8.EncodeRune(b[dst:], x), src + i
-	} else if x := entity2[entityName]; x[0] != 0 {
+	} else if x := entity2[string(entityName)]; x[0] != 0 {
 		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
 		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
 	} else if !attribute {
@@ -155,7 +156,7 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
 			maxLen = longestEntityWithoutSemicolon
 		}
 		for j := maxLen; j > 1; j-- {
-			if x := entity[entityName[:j]]; x != 0 {
+			if x := entity[string(entityName[:j])]; x != 0 {
 				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
 			}
 		}
@@ -166,26 +167,6 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
 	return dst1, src1
 }
 
-// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
-func unescape(b []byte) []byte {
-	for i, c := range b {
-		if c == '&' {
-			dst, src := unescapeEntity(b, i, i, false)
-			for src < len(b) {
-				c := b[src]
-				if c == '&' {
-					dst, src = unescapeEntity(b, dst, src, false)
-				} else {
-					b[dst] = c
-					dst, src = dst+1, src+1
-				}
-			}
-			return b[0:dst]
-		}
-	}
-	return b
-}
-
 var htmlEscaper = strings.NewReplacer(
 	`&`, "&amp;",
 	`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
@@ -208,8 +189,29 @@ func EscapeString(s string) string {
 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
 // always true.
 func UnescapeString(s string) string {
-	if !strings.Contains(s, "&") {
+	i := strings.IndexByte(s, '&')
+
+	if i < 0 {
 		return s
 	}
-	return string(unescape([]byte(s)))
+
+	b := []byte(s)
+	dst, src := unescapeEntity(b, i, i)
+	for len(s[src:]) > 0 {
+		if s[src] == '&' {
+			i = 0
+		} else {
+			i = strings.IndexByte(s[src:], '&')
+		}
+		if i < 0 {
+			dst += copy(b[dst:], s[src:])
+			break
+		}
+
+		if i > 0 {
+			copy(b[dst:], s[src:src+i])
+		}
+		dst, src = unescapeEntity(b, dst+i, src+i)
+	}
+	return string(b[:dst])
 }
diff --git a/src/html/escape_test.go b/src/html/escape_test.go
@@ -118,8 +118,10 @@ func TestUnescapeEscape(t *testing.T) {
 }
 
 var (
-	benchEscapeData = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100)
-	benchEscapeNone = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100)
+	benchEscapeData     = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100)
+	benchEscapeNone     = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100)
+	benchUnescapeSparse = strings.Repeat(strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 10)+"&amp;", 10)
+	benchUnescapeDense  = strings.Repeat("&amp;&lt; &amp; &lt;", 100)
 )
 
 func BenchmarkEscape(b *testing.B) {
@@ -151,3 +153,17 @@ func BenchmarkUnescapeNone(b *testing.B) {
 		n += len(UnescapeString(s))
 	}
 }
+
+func BenchmarkUnescapeSparse(b *testing.B) {
+	n := 0
+	for i := 0; i < b.N; i++ {
+		n += len(UnescapeString(benchUnescapeSparse))
+	}
+}
+
+func BenchmarkUnescapeDense(b *testing.B) {
+	n := 0
+	for i := 0; i < b.N; i++ {
+		n += len(UnescapeString(benchUnescapeDense))
+	}
+}