6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module golang.org/x/net
go 1.17

require (
golang.org/x/sys v0.3.0
golang.org/x/term v0.3.0
golang.org/x/text v0.5.0
golang.org/x/sys v0.5.0
golang.org/x/term v0.5.0
golang.org/x/text v0.7.0
)
12 changes: 6 additions & 6 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,17 @@ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.3.0 h1:w8ZOecv6NaNa/zC8944JTU3vz4u6Lagfk4RPQxv92NQ=
golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0 h1:MUK/U/4lj1t1oPg0HfuXDN/Z1wv31ZJ/YcPiGccS4DU=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.3.0 h1:qoo4akIqOcDME5bhc/NgxUdovd6BSS2uMsVjB56q1xI=
golang.org/x/term v0.3.0/go.mod h1:q750SLmJuPmVoN1blW3UFBPREJfb1KmY3vwxfr+nFDA=
golang.org/x/term v0.5.0 h1:n2a8QNdAb0sZNpU9R1ALUXBbY+w51fCQDN+7EdxNBsY=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.5.0 h1:OLmvp0KP+FVG99Ct/qFiL/Fhk4zp4QQnZ7b2U+5piUM=
golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.7.0 h1:4BRB4x83lYWy72KwLD/qYDuTu7q9PjSagHvijDw7cLo=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
Expand Down
270 changes: 270 additions & 0 deletions html/comment_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
"bytes"
"testing"
)

// TestComments exhaustively tests every 'interesting' N-byte string is
// correctly parsed as a comment. N ranges from 4+1 to 4+suffixLen inclusive,
// where 4 is the length of the "<!--" prefix that starts an HTML comment.
//
// 'Interesting' means that the N-4 byte suffix consists entirely of bytes
// sampled from the interestingCommentBytes const string, below. These cover
// all of the possible state transitions from comment-related parser states, as
// listed in the HTML spec (https://html.spec.whatwg.org/#comment-start-state
// and subsequent sections).
//
// The spec is written as an explicit state machine that, as a side effect,
// accumulates "the comment token's data" to a separate buffer.
// Tokenizer.readComment in this package does not have an explicit state
// machine and usually returns the comment text as a sub-slice of the input,
// between the opening '<' and closing '>' or EOF. This test confirms that the
// two algorithms match.
func TestComments(t *testing.T) {
const prefix = "<!--"
const suffixLen = 6
buffer := make([]byte, 0, len(prefix)+suffixLen)
testAllComments(t, append(buffer, prefix...))
}

// NUL isn't in this list, even though the HTML spec sections 13.2.5.43 -
// 13.2.5.52 mentions it. It's not interesting in terms of state transitions.
// It's equivalent to any other non-interesting byte (other than being replaced
// by U+FFFD REPLACEMENT CHARACTER).
//
// EOF isn't in this list. The HTML spec treats EOF as "an input character" but
// testOneComment below breaks the loop instead.
//
// 'x' represents all other "non-interesting" comment bytes.
var interestingCommentBytes = [...]byte{
'!', '-', '<', '>', 'x',
}

// testAllComments recursively fills in buffer[len(buffer):cap(buffer)] with
// interesting bytes and then tests that this package's tokenization matches
// the HTML spec.
//
// Precondition: len(buffer) < cap(buffer)
// Precondition: string(buffer[:4]) == "<!--"
func testAllComments(t *testing.T, buffer []byte) {
for _, interesting := range interestingCommentBytes {
b := append(buffer, interesting)
testOneComment(t, b)
if len(b) < cap(b) {
testAllComments(t, b)
}
}
}

func testOneComment(t *testing.T, b []byte) {
z := NewTokenizer(bytes.NewReader(b))
if next := z.Next(); next != CommentToken {
t.Fatalf("Next(%q): got %v, want %v", b, next, CommentToken)
}
gotRemainder := string(b[len(z.Raw()):])
gotComment := string(z.Text())

i := len("<!--")
wantBuffer := []byte(nil)
loop:
for state := 43; ; {
// Consume the next input character, handling EOF.
if i >= len(b) {
break
}
nextInputCharacter := b[i]
i++

switch state {
case 43: // 13.2.5.43 Comment start state.
switch nextInputCharacter {
case '-':
state = 44
case '>':
break loop
default:
i-- // Reconsume.
state = 45
}

case 44: // 13.2.5.44 Comment start dash state.
switch nextInputCharacter {
case '-':
state = 51
case '>':
break loop
default:
wantBuffer = append(wantBuffer, '-')
i-- // Reconsume.
state = 45
}

case 45: // 13.2.5.45 Comment state.
switch nextInputCharacter {
case '-':
state = 50
case '<':
wantBuffer = append(wantBuffer, '<')
state = 46
default:
wantBuffer = append(wantBuffer, nextInputCharacter)
}

case 46: // 13.2.5.46 Comment less-than sign state.
switch nextInputCharacter {
case '!':
wantBuffer = append(wantBuffer, '!')
state = 47
case '<':
wantBuffer = append(wantBuffer, '<')
state = 46
default:
i-- // Reconsume.
state = 45
}

case 47: // 13.2.5.47 Comment less-than sign bang state.
switch nextInputCharacter {
case '-':
state = 48
default:
i-- // Reconsume.
state = 45
}

case 48: // 13.2.5.48 Comment less-than sign bang dash state.
switch nextInputCharacter {
case '-':
state = 49
default:
i-- // Reconsume.
state = 50
}

case 49: // 13.2.5.49 Comment less-than sign bang dash dash state.
switch nextInputCharacter {
case '>':
break loop
default:
i-- // Reconsume.
state = 51
}

case 50: // 13.2.5.50 Comment end dash state.
switch nextInputCharacter {
case '-':
state = 51
default:
wantBuffer = append(wantBuffer, '-')
i-- // Reconsume.
state = 45
}

case 51: // 13.2.5.51 Comment end state.
switch nextInputCharacter {
case '!':
state = 52
case '-':
wantBuffer = append(wantBuffer, '-')
case '>':
break loop
default:
wantBuffer = append(wantBuffer, "--"...)
i-- // Reconsume.
state = 45
}

case 52: // 13.2.5.52 Comment end bang state.
switch nextInputCharacter {
case '-':
wantBuffer = append(wantBuffer, "--!"...)
state = 50
case '>':
break loop
default:
wantBuffer = append(wantBuffer, "--!"...)
i-- // Reconsume.
state = 45
}

default:
t.Fatalf("input=%q: unexpected state %d", b, state)
}
}

wantRemainder := ""
if i < len(b) {
wantRemainder = string(b[i:])
}
wantComment := string(wantBuffer)
if (gotComment != wantComment) || (gotRemainder != wantRemainder) {
t.Errorf("input=%q\ngot: %q + %q\nwant: %q + %q",
b, gotComment, gotRemainder, wantComment, wantRemainder)
}
}

// This table below summarizes the HTML-comment-related state machine from
// 13.2.5.43 "Comment start state" and subsequent sections.
// https://html.spec.whatwg.org/#comment-start-state
//
// Get to state 13.2.5.43 after seeing "<!--". Specifically, starting from the
// initial 13.2.5.1 "Data state":
// - "<" moves to 13.2.5.6 "Tag open state",
// - "!" moves to 13.2.5.42 "Markup declaration open state",
// - "--" moves to 13.2.5.43 "Comment start state".
// Each of these transitions are the only way to get to the 6/42/43 states.
//
// State ! - < > NUL EOF default HTML spec section
// 43 ... s44 ... s01.T.E0 ... ... r45 13.2.5.43 Comment start state
// 44 ... s51 ... s01.T.E0 ... T.Z.E1 r45.A- 13.2.5.44 Comment start dash state
// 45 ... s50 s46.A< ... t45.A?.E2 T.Z.E1 t45.Ax 13.2.5.45 Comment state
// 46 s47.A! ... t46.A< ... ... ... r45 13.2.5.46 Comment less-than sign state
// 47 ... s48 ... ... ... ... r45 13.2.5.47 Comment less-than sign bang state
// 48 ... s49 ... ... ... ... r50 13.2.5.48 Comment less-than sign bang dash state
// 49 ... ... ... s01.T ... T.Z.E1 r51.E3 13.2.5.49 Comment less-than sign bang dash dash state
// 50 ... s51 ... ... ... T.Z.E1 r45.A- 13.2.5.50 Comment end dash state
// 51 s52 t51.A- ... s01.T ... T.Z.E1 r45.A-- 13.2.5.51 Comment end state
// 52 ... s50.A--! ... s01.T.E4 ... T.Z.E1 r45.A--! 13.2.5.52 Comment end bang state
//
// State 43 is the "Comment start state" meaning that we've only seen "<!--"
// and nothing else. Similarly, state 44 means that we've only seen "<!---",
// with three dashes, and nothing else. For the other states, we deduce
// (working backwards) that the immediate prior input must be:
// - 45 something that's not '-'
// - 46 "<"
// - 47 "<!"
// - 48 "<!-"
// - 49 "<!--" not including the opening "<!--"
// - 50 "-" not including the opening "<!--" and also not "--"
// - 51 "--" not including the opening "<!--"
// - 52 "--!"
//
// The table cell actions:
// - ... do the default action
// - A! append "!" to the comment token's data.
// - A- append "-" to the comment token's data.
// - A-- append "--" to the comment token's data.
// - A--! append "--!" to the comment token's data.
// - A< append "<" to the comment token's data.
// - A? append "\uFFFD" to the comment token's data.
// - Ax append the current input character to the comment token's data.
// - E0 parse error (abrupt-closing-of-empty-comment).
// - E1 parse error (eof-in-comment).
// - E2 parse error (unexpected-null-character).
// - E3 parse error (nested-comment).
// - E4 parse error (incorrectly-closed-comment).
// - T emit the current comment token.
// - Z emit an end-of-file token.
// - rNN reconsume in the 13.2.5.NN state (after any A* or E* operations).
// - s01 switch to the 13.2.5.1 Data state (after any A* or E* operations).
// - sNN switch to the 13.2.5.NN state (after any A* or E* operations).
// - tNN stay in the 13.2.5.NN state (after any A* or E* operations).
//
// The E* actions are called errors in the HTML spec but they are not fatal
// (https://html.spec.whatwg.org/#parse-errors says "may [but not must] abort
// the parser"). They are warnings that, in practice, browsers simply ignore.
2 changes: 1 addition & 1 deletion html/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ func (p *parser) clearStackToContext(s scope) {
}
}

// parseGenericRawTextElements implements the generic raw text element parsing
// parseGenericRawTextElement implements the generic raw text element parsing
// algorithm defined in 12.2.6.2.
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text
// TODO: Since both RAWTEXT and RCDATA states are treated as tokenizer's part
Expand Down
49 changes: 41 additions & 8 deletions html/token.go
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,11 @@ scriptDataDoubleEscapeEnd:
// readComment reads the next comment token starting with "<!--". The opening
// "<!--" has already been consumed.
func (z *Tokenizer) readComment() {
// When modifying this function, consider manually increasing the suffixLen
// constant in func TestComments, from 6 to e.g. 9 or more. That increase
// should only be temporary, not committed, as it exponentially affects the
// test running time.

z.data.start = z.raw.end
defer func() {
if z.data.end < z.data.start {
Expand All @@ -611,11 +616,7 @@ func (z *Tokenizer) readComment() {
for {
c := z.readByte()
if z.err != nil {
// Ignore up to two dashes at EOF.
if dashCount > 2 {
dashCount = 2
}
z.data.end = z.raw.end - dashCount
z.data.end = z.calculateAbruptCommentDataEnd()
return
}
switch c {
Expand All @@ -631,12 +632,15 @@ func (z *Tokenizer) readComment() {
if dashCount >= 2 {
c = z.readByte()
if z.err != nil {
z.data.end = z.raw.end
z.data.end = z.calculateAbruptCommentDataEnd()
return
}
if c == '>' {
} else if c == '>' {
z.data.end = z.raw.end - len("--!>")
return
} else if c == '-' {
dashCount = 1
beginning = false
continue
}
}
}
Expand All @@ -645,6 +649,35 @@ func (z *Tokenizer) readComment() {
}
}

func (z *Tokenizer) calculateAbruptCommentDataEnd() int {
raw := z.Raw()
const prefixLen = len("<!--")
if len(raw) >= prefixLen {
raw = raw[prefixLen:]
if hasSuffix(raw, "--!") {
return z.raw.end - 3
} else if hasSuffix(raw, "--") {
return z.raw.end - 2
} else if hasSuffix(raw, "-") {
return z.raw.end - 1
}
}
return z.raw.end
}

func hasSuffix(b []byte, suffix string) bool {
if len(b) < len(suffix) {
return false
}
b = b[len(b)-len(suffix):]
for i := range b {
if b[i] != suffix[i] {
return false
}
}
return true
}

// readUntilCloseAngle reads until the next ">".
func (z *Tokenizer) readUntilCloseAngle() {
z.data.start = z.raw.end
Expand Down
Loading