html: have Render escape comments less often

Fixes golang/go#58246 Change-Id: I3effbd2afd7e363a42baa4db20691e57c9a08389 Reviewed-on: https://go-review.googlesource.com/c/net/+/469056 TryBot-Result: Gopher Robot <gobot@golang.org> Run-TryBot: Nigel Tao <nigeltao@golang.org> Reviewed-by: Bryan Mills <bcmills@google.com> Reviewed-by: Kunpei Sakai <namusyaka@gmail.com> Reviewed-by: Damien Neil <dneil@google.com>
golang · Feb 28, 2023 · 1d46ed8 · 1d46ed8
1 parent 569fe81
commit 1d46ed8
Show file tree

Hide file tree

Showing 5 changed files with 137 additions and 33 deletions.
diff --git a/html/comment_test.go b/html/comment_test.go
@@ -6,12 +6,13 @@ package html
 
 import (
 	"bytes"
+	"strings"
 	"testing"
 )
 
 // TestComments exhaustively tests every 'interesting' N-byte string is
-// correctly parsed as a comment. N ranges from 4+1 to 4+suffixLen inclusive,
-// where 4 is the length of the "<!--" prefix that starts an HTML comment.
+// correctly parsed as a comment. N ranges from 4+1 to 4+maxSuffixLen
+// inclusive. 4 is the length of the "<!--" prefix that starts an HTML comment.
 //
 // 'Interesting' means that the N-4 byte suffix consists entirely of bytes
 // sampled from the interestingCommentBytes const string, below. These cover
@@ -27,8 +28,8 @@ import (
 // two algorithms match.
 func TestComments(t *testing.T) {
 	const prefix = "<!--"
-	const suffixLen = 6
-	buffer := make([]byte, 0, len(prefix)+suffixLen)
+	const maxSuffixLen = 6
+	buffer := make([]byte, 0, len(prefix)+maxSuffixLen)
 	testAllComments(t, append(buffer, prefix...))
 }
 
@@ -205,6 +206,26 @@ loop:
 	if (gotComment != wantComment) || (gotRemainder != wantRemainder) {
 		t.Errorf("input=%q\ngot:  %q + %q\nwant: %q + %q",
 			b, gotComment, gotRemainder, wantComment, wantRemainder)
+		return
+	}
+
+	// suffix is the "N-4 byte suffix" per the TestComments comment.
+	suffix := string(b[4:])
+
+	// Test that a round trip, rendering (escaped) and re-parsing, of a comment
+	// token (with that suffix as the Token.Data) preserves that string.
+	tok := Token{
+		Type: CommentToken,
+		Data: suffix,
+	}
+	z2 := NewTokenizer(strings.NewReader(tok.String()))
+	if next := z2.Next(); next != CommentToken {
+		t.Fatalf("round-trip Next(%q): got %v, want %v", suffix, next, CommentToken)
+	}
+	gotComment2 := string(z2.Text())
+	if gotComment2 != suffix {
+		t.Errorf("round-trip\ngot:  %q\nwant: %q", gotComment2, suffix)
+		return
 	}
 }
 

diff --git a/html/escape.go b/html/escape.go
@@ -193,6 +193,87 @@ func lower(b []byte) []byte {
 	return b
 }
 
+// escapeComment is like func escape but escapes its input bytes less often.
+// Per https://github.com/golang/go/issues/58246 some HTML comments are (1)
+// meaningful and (2) contain angle brackets that we'd like to avoid escaping
+// unless we have to.
+//
+// "We have to" includes the '&' byte, since that introduces other escapes.
+//
+// It also includes those bytes (not including EOF) that would otherwise end
+// the comment. Per the summary table at the bottom of comment_test.go, this is
+// the '>' byte that, per above, we'd like to avoid escaping unless we have to.
+//
+// Studying the summary table (and T actions in its '>' column) closely, we
+// only need to escape in states 43, 44, 49, 51 and 52. State 43 is at the
+// start of the comment data. State 52 is after a '!'. The other three states
+// are after a '-'.
+//
+// Our algorithm is thus to escape every '&' and to escape '>' if and only if:
+//   - The '>' is after a '!' or '-' (in the unescaped data) or
+//   - The '>' is at the start of the comment data (after the opening "<!--").
+func escapeComment(w writer, s string) error {
+	// When modifying this function, consider manually increasing the
+	// maxSuffixLen constant in func TestComments, from 6 to e.g. 9 or more.
+	// That increase should only be temporary, not committed, as it
+	// exponentially affects the test running time.
+
+	if len(s) == 0 {
+		return nil
+	}
+
+	// Loop:
+	//   - Grow j such that s[i:j] does not need escaping.
+	//   - If s[j] does need escaping, output s[i:j] and an escaped s[j],
+	//     resetting i and j to point past that s[j] byte.
+	i := 0
+	for j := 0; j < len(s); j++ {
+		escaped := ""
+		switch s[j] {
+		case '&':
+			escaped = "&amp;"
+
+		case '>':
+			if j > 0 {
+				if prev := s[j-1]; (prev != '!') && (prev != '-') {
+					continue
+				}
+			}
+			escaped = "&gt;"
+
+		default:
+			continue
+		}
+
+		if i < j {
+			if _, err := w.WriteString(s[i:j]); err != nil {
+				return err
+			}
+		}
+		if _, err := w.WriteString(escaped); err != nil {
+			return err
+		}
+		i = j + 1
+	}
+
+	if i < len(s) {
+		if _, err := w.WriteString(s[i:]); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// escapeCommentString is to EscapeString as escapeComment is to escape.
+func escapeCommentString(s string) string {
+	if strings.IndexAny(s, "&>") == -1 {
+		return s
+	}
+	var buf bytes.Buffer
+	escapeComment(&buf, s)
+	return buf.String()
+}
+
 const escapedChars = "&'<>\"\r"
 
 func escape(w writer, s string) error {

diff --git a/html/render.go b/html/render.go
@@ -85,7 +85,7 @@ func render1(w writer, n *Node) error {
 		if _, err := w.WriteString("<!--"); err != nil {
 			return err
 		}
-		if err := escape(w, n.Data); err != nil {
+		if err := escapeComment(w, n.Data); err != nil {
 			return err
 		}
 		if _, err := w.WriteString("-->"); err != nil {

diff --git a/html/token.go b/html/token.go
@@ -110,7 +110,7 @@ func (t Token) String() string {
 	case SelfClosingTagToken:
 		return "<" + t.tagString() + "/>"
 	case CommentToken:
-		return "<!--" + EscapeString(t.Data) + "-->"
+		return "<!--" + escapeCommentString(t.Data) + "-->"
 	case DoctypeToken:
 		return "<!DOCTYPE " + EscapeString(t.Data) + ">"
 	}
@@ -598,10 +598,10 @@ scriptDataDoubleEscapeEnd:
 // readComment reads the next comment token starting with "<!--". The opening
 // "<!--" has already been consumed.
 func (z *Tokenizer) readComment() {
-	// When modifying this function, consider manually increasing the suffixLen
-	// constant in func TestComments, from 6 to e.g. 9 or more. That increase
-	// should only be temporary, not committed, as it exponentially affects the
-	// test running time.
+	// When modifying this function, consider manually increasing the
+	// maxSuffixLen constant in func TestComments, from 6 to e.g. 9 or more.
+	// That increase should only be temporary, not committed, as it
+	// exponentially affects the test running time.
 
 	z.data.start = z.raw.end
 	defer func() {

diff --git a/html/token_test.go b/html/token_test.go
@@ -23,14 +23,6 @@ const issue58246 = `<!--[if gte mso 12]>
       </o:OfficeDocumentSettings>
     </xml>
 <![endif]-->`
-const issue58246Rendered = `<!--[if gte mso 12]&gt;
-  &lt;xml&gt;
-      &lt;o:OfficeDocumentSettings&gt;
-      &lt;o:AllowPNG/&gt;
-      &lt;o:PixelsPerInch&gt;96&lt;/o:PixelsPerInch&gt;
-      &lt;/o:OfficeDocumentSettings&gt;
-    &lt;/xml&gt;
-&lt;![endif]-->`
 
 type tokenTest struct {
 	// A short description of the test case.
@@ -332,7 +324,7 @@ var tokenTests = []tokenTest{
 	{
 		"comment3",
 		"a<!--x>-->z",
-		"a$<!--x&gt;-->$z",
+		"a$<!--x>-->$z",
 	},
 	{
 		"comment4",
@@ -352,7 +344,7 @@ var tokenTests = []tokenTest{
 	{
 		"comment7",
 		"a<!---<>z",
-		"a$<!---&lt;&gt;z-->",
+		"a$<!---<>z-->",
 	},
 	{
 		"comment8",
@@ -407,12 +399,12 @@ var tokenTests = []tokenTest{
 	{
 		"comment18",
 		"a<!--<!-->z",
-		"a$<!--&lt;!-->$z",
+		"a$<!--<!-->$z",
 	},
 	{
 		"comment19",
 		"a<!--<!--",
-		"a$<!--&lt;!-->",
+		"a$<!--<!-->",
 	},
 	{
 		"comment20",
@@ -427,7 +419,7 @@ var tokenTests = []tokenTest{
 	{
 		"comment22",
 		"a<!--!--!<--!-->z",
-		"a$<!--!--!&lt;--!-->$z",
+		"a$<!--!--!<--!-->$z",
 	},
 	{
 		"comment23",
@@ -437,27 +429,27 @@ var tokenTests = []tokenTest{
 	{
 		"comment24",
 		"a<!--&gt;>x",
-		"a$<!--&gt;&gt;x-->",
+		"a$<!--&gt;>x-->",
 	},
 	{
 		"comment25",
 		"a<!--&gt;&gt;",
-		"a$<!--&gt;&gt;-->",
+		"a$<!--&gt;>-->",
 	},
 	{
 		"comment26",
 		"a<!--&gt;&gt;-",
-		"a$<!--&gt;&gt;-->",
+		"a$<!--&gt;>-->",
 	},
 	{
 		"comment27",
 		"a<!--&gt;&gt;-->z",
-		"a$<!--&gt;&gt;-->$z",
+		"a$<!--&gt;>-->$z",
 	},
 	{
 		"comment28",
 		"a<!--&amp;&gt;-->z",
-		"a$<!--&amp;&gt;-->$z",
+		"a$<!--&amp;>-->$z",
 	},
 	{
 		"comment29",
@@ -469,10 +461,20 @@ var tokenTests = []tokenTest{
 		"a<!--&nosuchentity;-->z",
 		"a$<!--&amp;nosuchentity;-->$z",
 	},
+	{
+		"comment31",
+		"a<!--i>>j-->z",
+		"a$<!--i>>j-->$z",
+	},
+	{
+		"comment32",
+		"a<!--i!>>j-->z",
+		"a$<!--i!&gt;>j-->$z",
+	},
 	// https://stackoverflow.design/email/base/mso/#targeting-specific-outlook-versions
 	// says "[For] Windows Outlook 2003 and above... conditional comments allow
 	// us to add bits of HTML that are only read by the Word-based versions of
-	// Outlook". TODO: these comments (with angle brackets) should pass through
+	// Outlook". These comments (with angle brackets) should pass through
 	// unchanged (by this Go package) when rendering.
 	//
 	// We should also still escape ">" as "&gt;" when necessary.
@@ -484,22 +486,22 @@ var tokenTests = []tokenTest{
 	{
 		"issue48237CommentWithAmpgtsemi1",
 		"a<!--<p></p>&lt;!--[video]--&gt;-->z",
-		"a$<!--&lt;p&gt;&lt;/p&gt;&lt;!--[video]--&gt;-->$z",
+		"a$<!--<p></p><!--[video]--&gt;-->$z",
 	},
 	{
 		"issue48237CommentWithAmpgtsemi2",
 		"a<!--<p></p>&lt;!--[video]--!&gt;-->z",
-		"a$<!--&lt;p&gt;&lt;/p&gt;&lt;!--[video]--!&gt;-->$z",
+		"a$<!--<p></p><!--[video]--!&gt;-->$z",
 	},
 	{
 		"issue58246MicrosoftOutlookComment1",
 		"a<!--[if mso]> your code <![endif]-->z",
-		"a$<!--[if mso]&gt; your code &lt;![endif]-->$z",
+		"a$<!--[if mso]> your code <![endif]-->$z",
 	},
 	{
 		"issue58246MicrosoftOutlookComment2",
 		"a" + issue58246 + "z",
-		"a$" + issue58246Rendered + "$z",
+		"a$" + issue58246 + "$z",
 	},
 	// An attribute with a backslash.
 	{