Skip to content

Commit

Permalink
html: have Render escape comments less often
Browse files Browse the repository at this point in the history
Fixes golang/go#58246

Change-Id: I3effbd2afd7e363a42baa4db20691e57c9a08389
Reviewed-on: https://go-review.googlesource.com/c/net/+/469056
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Nigel Tao <nigeltao@golang.org>
Reviewed-by: Bryan Mills <bcmills@google.com>
Reviewed-by: Kunpei Sakai <namusyaka@gmail.com>
Reviewed-by: Damien Neil <dneil@google.com>
  • Loading branch information
nigeltao committed Feb 28, 2023
1 parent 569fe81 commit 1d46ed8
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 33 deletions.
29 changes: 25 additions & 4 deletions html/comment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@ package html

import (
"bytes"
"strings"
"testing"
)

// TestComments exhaustively tests every 'interesting' N-byte string is
// correctly parsed as a comment. N ranges from 4+1 to 4+suffixLen inclusive,
// where 4 is the length of the "<!--" prefix that starts an HTML comment.
// correctly parsed as a comment. N ranges from 4+1 to 4+maxSuffixLen
// inclusive. 4 is the length of the "<!--" prefix that starts an HTML comment.
//
// 'Interesting' means that the N-4 byte suffix consists entirely of bytes
// sampled from the interestingCommentBytes const string, below. These cover
Expand All @@ -27,8 +28,8 @@ import (
// two algorithms match.
func TestComments(t *testing.T) {
const prefix = "<!--"
const suffixLen = 6
buffer := make([]byte, 0, len(prefix)+suffixLen)
const maxSuffixLen = 6
buffer := make([]byte, 0, len(prefix)+maxSuffixLen)
testAllComments(t, append(buffer, prefix...))
}

Expand Down Expand Up @@ -205,6 +206,26 @@ loop:
if (gotComment != wantComment) || (gotRemainder != wantRemainder) {
t.Errorf("input=%q\ngot: %q + %q\nwant: %q + %q",
b, gotComment, gotRemainder, wantComment, wantRemainder)
return
}

// suffix is the "N-4 byte suffix" per the TestComments comment.
suffix := string(b[4:])

// Test that a round trip, rendering (escaped) and re-parsing, of a comment
// token (with that suffix as the Token.Data) preserves that string.
tok := Token{
Type: CommentToken,
Data: suffix,
}
z2 := NewTokenizer(strings.NewReader(tok.String()))
if next := z2.Next(); next != CommentToken {
t.Fatalf("round-trip Next(%q): got %v, want %v", suffix, next, CommentToken)
}
gotComment2 := string(z2.Text())
if gotComment2 != suffix {
t.Errorf("round-trip\ngot: %q\nwant: %q", gotComment2, suffix)
return
}
}

Expand Down
81 changes: 81 additions & 0 deletions html/escape.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,87 @@ func lower(b []byte) []byte {
return b
}

// escapeComment is like func escape but escapes its input bytes less often.
// Per https://github.com/golang/go/issues/58246 some HTML comments are (1)
// meaningful and (2) contain angle brackets that we'd like to avoid escaping
// unless we have to.
//
// "We have to" includes the '&' byte, since that introduces other escapes.
//
// It also includes those bytes (not including EOF) that would otherwise end
// the comment. Per the summary table at the bottom of comment_test.go, this is
// the '>' byte that, per above, we'd like to avoid escaping unless we have to.
//
// Studying the summary table (and T actions in its '>' column) closely, we
// only need to escape in states 43, 44, 49, 51 and 52. State 43 is at the
// start of the comment data. State 52 is after a '!'. The other three states
// are after a '-'.
//
// Our algorithm is thus to escape every '&' and to escape '>' if and only if:
// - The '>' is after a '!' or '-' (in the unescaped data) or
// - The '>' is at the start of the comment data (after the opening "<!--").
func escapeComment(w writer, s string) error {
// When modifying this function, consider manually increasing the
// maxSuffixLen constant in func TestComments, from 6 to e.g. 9 or more.
// That increase should only be temporary, not committed, as it
// exponentially affects the test running time.

if len(s) == 0 {
return nil
}

// Loop:
// - Grow j such that s[i:j] does not need escaping.
// - If s[j] does need escaping, output s[i:j] and an escaped s[j],
// resetting i and j to point past that s[j] byte.
i := 0
for j := 0; j < len(s); j++ {
escaped := ""
switch s[j] {
case '&':
escaped = "&amp;"

case '>':
if j > 0 {
if prev := s[j-1]; (prev != '!') && (prev != '-') {
continue
}
}
escaped = "&gt;"

default:
continue
}

if i < j {
if _, err := w.WriteString(s[i:j]); err != nil {
return err
}
}
if _, err := w.WriteString(escaped); err != nil {
return err
}
i = j + 1
}

if i < len(s) {
if _, err := w.WriteString(s[i:]); err != nil {
return err
}
}
return nil
}

// escapeCommentString is to EscapeString as escapeComment is to escape.
func escapeCommentString(s string) string {
if strings.IndexAny(s, "&>") == -1 {
return s
}
var buf bytes.Buffer
escapeComment(&buf, s)
return buf.String()
}

const escapedChars = "&'<>\"\r"

func escape(w writer, s string) error {
Expand Down
2 changes: 1 addition & 1 deletion html/render.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func render1(w writer, n *Node) error {
if _, err := w.WriteString("<!--"); err != nil {
return err
}
if err := escape(w, n.Data); err != nil {
if err := escapeComment(w, n.Data); err != nil {
return err
}
if _, err := w.WriteString("-->"); err != nil {
Expand Down
10 changes: 5 additions & 5 deletions html/token.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ func (t Token) String() string {
case SelfClosingTagToken:
return "<" + t.tagString() + "/>"
case CommentToken:
return "<!--" + EscapeString(t.Data) + "-->"
return "<!--" + escapeCommentString(t.Data) + "-->"
case DoctypeToken:
return "<!DOCTYPE " + EscapeString(t.Data) + ">"
}
Expand Down Expand Up @@ -598,10 +598,10 @@ scriptDataDoubleEscapeEnd:
// readComment reads the next comment token starting with "<!--". The opening
// "<!--" has already been consumed.
func (z *Tokenizer) readComment() {
// When modifying this function, consider manually increasing the suffixLen
// constant in func TestComments, from 6 to e.g. 9 or more. That increase
// should only be temporary, not committed, as it exponentially affects the
// test running time.
// When modifying this function, consider manually increasing the
// maxSuffixLen constant in func TestComments, from 6 to e.g. 9 or more.
// That increase should only be temporary, not committed, as it
// exponentially affects the test running time.

z.data.start = z.raw.end
defer func() {
Expand Down
48 changes: 25 additions & 23 deletions html/token_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,6 @@ const issue58246 = `<!--[if gte mso 12]>
</o:OfficeDocumentSettings>
</xml>
<![endif]-->`
const issue58246Rendered = `<!--[if gte mso 12]&gt;
&lt;xml&gt;
&lt;o:OfficeDocumentSettings&gt;
&lt;o:AllowPNG/&gt;
&lt;o:PixelsPerInch&gt;96&lt;/o:PixelsPerInch&gt;
&lt;/o:OfficeDocumentSettings&gt;
&lt;/xml&gt;
&lt;![endif]-->`

type tokenTest struct {
// A short description of the test case.
Expand Down Expand Up @@ -332,7 +324,7 @@ var tokenTests = []tokenTest{
{
"comment3",
"a<!--x>-->z",
"a$<!--x&gt;-->$z",
"a$<!--x>-->$z",
},
{
"comment4",
Expand All @@ -352,7 +344,7 @@ var tokenTests = []tokenTest{
{
"comment7",
"a<!---<>z",
"a$<!---&lt;&gt;z-->",
"a$<!---<>z-->",
},
{
"comment8",
Expand Down Expand Up @@ -407,12 +399,12 @@ var tokenTests = []tokenTest{
{
"comment18",
"a<!--<!-->z",
"a$<!--&lt;!-->$z",
"a$<!--<!-->$z",
},
{
"comment19",
"a<!--<!--",
"a$<!--&lt;!-->",
"a$<!--<!-->",
},
{
"comment20",
Expand All @@ -427,7 +419,7 @@ var tokenTests = []tokenTest{
{
"comment22",
"a<!--!--!<--!-->z",
"a$<!--!--!&lt;--!-->$z",
"a$<!--!--!<--!-->$z",
},
{
"comment23",
Expand All @@ -437,27 +429,27 @@ var tokenTests = []tokenTest{
{
"comment24",
"a<!--&gt;>x",
"a$<!--&gt;&gt;x-->",
"a$<!--&gt;>x-->",
},
{
"comment25",
"a<!--&gt;&gt;",
"a$<!--&gt;&gt;-->",
"a$<!--&gt;>-->",
},
{
"comment26",
"a<!--&gt;&gt;-",
"a$<!--&gt;&gt;-->",
"a$<!--&gt;>-->",
},
{
"comment27",
"a<!--&gt;&gt;-->z",
"a$<!--&gt;&gt;-->$z",
"a$<!--&gt;>-->$z",
},
{
"comment28",
"a<!--&amp;&gt;-->z",
"a$<!--&amp;&gt;-->$z",
"a$<!--&amp;>-->$z",
},
{
"comment29",
Expand All @@ -469,10 +461,20 @@ var tokenTests = []tokenTest{
"a<!--&nosuchentity;-->z",
"a$<!--&amp;nosuchentity;-->$z",
},
{
"comment31",
"a<!--i>>j-->z",
"a$<!--i>>j-->$z",
},
{
"comment32",
"a<!--i!>>j-->z",
"a$<!--i!&gt;>j-->$z",
},
// https://stackoverflow.design/email/base/mso/#targeting-specific-outlook-versions
// says "[For] Windows Outlook 2003 and above... conditional comments allow
// us to add bits of HTML that are only read by the Word-based versions of
// Outlook". TODO: these comments (with angle brackets) should pass through
// Outlook". These comments (with angle brackets) should pass through
// unchanged (by this Go package) when rendering.
//
// We should also still escape ">" as "&gt;" when necessary.
Expand All @@ -484,22 +486,22 @@ var tokenTests = []tokenTest{
{
"issue48237CommentWithAmpgtsemi1",
"a<!--<p></p>&lt;!--[video]--&gt;-->z",
"a$<!--&lt;p&gt;&lt;/p&gt;&lt;!--[video]--&gt;-->$z",
"a$<!--<p></p><!--[video]--&gt;-->$z",
},
{
"issue48237CommentWithAmpgtsemi2",
"a<!--<p></p>&lt;!--[video]--!&gt;-->z",
"a$<!--&lt;p&gt;&lt;/p&gt;&lt;!--[video]--!&gt;-->$z",
"a$<!--<p></p><!--[video]--!&gt;-->$z",
},
{
"issue58246MicrosoftOutlookComment1",
"a<!--[if mso]> your code <![endif]-->z",
"a$<!--[if mso]&gt; your code &lt;![endif]-->$z",
"a$<!--[if mso]> your code <![endif]-->$z",
},
{
"issue58246MicrosoftOutlookComment2",
"a" + issue58246 + "z",
"a$" + issue58246Rendered + "$z",
"a$" + issue58246 + "$z",
},
// An attribute with a backslash.
{
Expand Down

0 comments on commit 1d46ed8

Please sign in to comment.