Skip to content
Permalink
d902791b50
Switch branches/tags
Go to file
This is one of several changes that were part of a larger rewrite
which I made in early 2019 after switching to the new number literal
syntax implementation. The purpose of the rewrite was to simplify
reading of source code (Unicode character by character) and speed up
the scanner but was never submitted for review due to other priorities.

Part 3 of 3:

This change contains a complete rewrite of source.go, the file that
implements reading individual Unicode characters from the source.
The new implementation is easier to use and has simpler literal
buffer management, resulting in faster scanner and thus parser
performance.

The new source.go (internal) API is centered around nextch() which
advances the scanner by one character. The scanner has been adjusted
around nextch() and now consistently does one character look-ahead
(there's no need for complicated ungetr-ing anymore). Backtracking is
needed in only one case (when finding '..' rather than '...'), and
that case is now solved more cleanly with the new rewind() function.

Measuring line/s parsing performance by running

go test -run StdLib -fast -skip "syntax/(scanner|source)\.go"

(best of 5 runs on "quiet" MacBook Pro, 3.3GHz Dual-Core i7, 16GB RAM,
OS X 10.15.3) before and after shows consistently 3-5% improvement of
line parsing speed:

old: parsed 1788155 lines (3969 files) in 1.255520307s (1424234 lines/s)
new: parsed 1788155 lines (3969 files) in 1.213197037s (1473919 lines/s)

(scanner.go and parser.go are skipped because this CL changed those files.)

Change-Id: Ida947f4b538d42eb2d2349062c69edb6c9e5ca66
Reviewed-on: https://go-review.googlesource.com/c/go/+/221603
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2 contributors

Users who have contributed to this file

@griesemer @mdempsky
218 lines (192 sloc) 5.71 KB
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file implements source, a buffered rune reader
// specialized for scanning Go code: Reading
// ASCII characters, maintaining current (line, col)
// position information, and recording of the most
// recently read source segment are highly optimized.
// This file is self-contained (go tool compile source.go
// compiles) and thus could be made into its own package.
package syntax
import (
"io"
"unicode/utf8"
)
// The source buffer is accessed using three indices b (begin),
// r (read), and e (end):
//
// - If b >= 0, it points to the beginning of a segment of most
// recently read characters (typically a Go literal).
//
// - r points to the byte immediately following the most recently
// read character ch, which starts at r-chw.
//
// - e points to the byte immediately following the last byte that
// was read into the buffer.
//
// The buffer content is terminated at buf[e] with the sentinel
// character utf8.RuneSelf. This makes it possible to test for
// the common case of ASCII characters with a single 'if' (see
// nextch method).
//
//                +------ content in use -------+
//                v                             v
// buf [...read...|...segment...|ch|...unread...|s|...free...]
//                ^             ^  ^            ^
//                |             |  |            |
//                b         r-chw  r            e
//
// Invariant: -1 <= b < r <= e < len(buf) && buf[e] == sentinel
type source struct {
	// in is the reader from which fill obtains more source bytes.
	in io.Reader
	// errh is the error handler; error calls it with the position
	// reported by pos and the error message.
	errh func(line, col uint, msg string)

	buf   []byte // source buffer
	ioerr error  // pending I/O error, or nil

	// buffer indices (see comment above)
	b int // begin of the active segment, or < 0 if there is none
	r int // read position: the byte immediately following ch
	e int // end of buffered content; buf[e] holds the sentinel

	// source position of ch (0-based)
	line uint
	col  uint

	ch  rune // most recently read character
	chw int  // width of ch
}
// sentinel is the byte value that terminates the buffer content (it is
// stored at buf[e]). Because it equals utf8.RuneSelf, every ASCII byte
// compares less than it, which lets nextch detect "ASCII character
// available" with a single comparison.
const sentinel = utf8.RuneSelf
// init prepares s for reading from in and reporting errors through errh.
// A buffer left over from a previous use of s is kept; otherwise a buffer
// of the minimum size is allocated. All indices and the source position
// are reset, and the current character is set to a blank of width 0 so
// that the first call of nextch does not advance the position.
func (s *source) init(in io.Reader, errh func(line, col uint, msg string)) {
	s.in, s.errh = in, errh

	if s.buf == nil {
		s.buf = make([]byte, nextSize(0))
	}
	s.buf[0] = sentinel // no content yet: the sentinel sits at buf[e] with e == 0

	s.ioerr = nil
	s.b, s.r, s.e = -1, 0, 0 // no active segment, nothing read, nothing buffered
	s.line, s.col = 0, 0
	s.ch, s.chw = ' ', 0
}
// Line and column numbers reported by pos start at these values;
// internally, line and column are tracked 0-based.
const (
	linebase = 1
	colbase  = 1
)
// pos reports the source position of s.ch as a (line, col) pair.
// The internally 0-based values are offset by linebase and colbase.
func (s *source) pos() (line, col uint) {
	line = s.line + linebase
	col = s.col + colbase
	return line, col
}
// error passes msg to the error handler s.errh, together with the
// current source position as returned by s.pos().
func (s *source) error(msg string) {
	ln, c := s.pos()
	s.errh(ln, c, msg)
}
// start begins a new active source segment at the current character
// s.ch, which is part of the segment. Until stop is called, the bytes
// of the active segment (up to, but excluding, the then-current s.ch)
// can be obtained by calling segment.
func (s *source) start() {
	s.b = s.r - s.chw // offset of the first byte of s.ch
}
// stop ends the active segment by resetting its begin index to -1.
func (s *source) stop() {
	s.b = -1
}
// segment returns the bytes of the active segment: everything from the
// segment's begin index s.b up to, but excluding, the current character
// s.ch. The result is a slice of the source buffer, not a copy.
func (s *source) segment() []byte {
	end := s.r - s.chw // offset of the first byte of s.ch
	return s.buf[s.b:end]
}
// rewind moves the read position back to the beginning of the currently
// active segment and re-reads the character found there into s.ch.
// The segment must not contain any newlines; otherwise the resulting
// position information is incorrect. At present rewind is needed only
// for the source sequence "..". It panics if there is no active segment.
func (s *source) rewind() {
	// rewind is called rarely, so verifying the precondition is affordable
	if s.b < 0 {
		panic("no active segment")
	}

	// Step back over all bytes read since the segment began; the nextch
	// call below adds the width of the current character back to s.col
	// before it reads the character at the segment start.
	back := uint(s.r - s.b)
	s.r = s.b
	s.col -= back
	s.nextch()
}
// nextch advances the source by one character. On return, s.ch holds the
// character that was read and s.chw its width in bytes, s.r points to the
// byte following it, and (s.line, s.col) is its 0-based position, with
// s.col counted in bytes. When no more bytes are available, s.ch is set
// to -1 and s.chw to 0; an I/O error other than io.EOF is reported first.
//
// NUL bytes and invalid UTF-8 bytes are reported via s.error and then
// skipped. A byte order mark is skipped as well; it is reported as an
// error unless it is found at position (0, 0).
func (s *source) nextch() {
redo:
// Account for the character being left behind: move the column past it
// and, if it was a newline, continue at the start of the next line.
// (After init, s.chw is 0 and s.ch is ' ', so the first call does not
// change the position.)
s.col += uint(s.chw)
if s.ch == '\n' {
s.line++
s.col = 0
}
// fast common case: at least one ASCII character
// (the byte at buf[e] is the sentinel, which is not < sentinel, so this
// test also fails when all buffered bytes have been consumed)
if s.ch = rune(s.buf[s.r]); s.ch < sentinel {
s.r++
s.chw = 1
if s.ch == 0 {
// s.col was already advanced above, so the error is reported
// at the position of the NUL byte itself
s.error("invalid NUL character")
goto redo
}
return
}
// slower general case: add more bytes to buffer if we don't have a full rune
// (stop as soon as utf8.UTFMax bytes are available, the buffered bytes
// start with a full rune, or fill has recorded an I/O error)
for s.e-s.r < utf8.UTFMax && !utf8.FullRune(s.buf[s.r:s.e]) && s.ioerr == nil {
s.fill()
}
// EOF
// (with no unread bytes, the loop above can only have ended because
// s.ioerr != nil, so calling s.ioerr.Error() below is safe)
if s.r == s.e {
if s.ioerr != io.EOF {
// ensure we never start with a '/' (e.g., rooted path) in the error message
s.error("I/O error: " + s.ioerr.Error())
s.ioerr = nil
}
s.ch = -1
s.chw = 0
return
}
s.ch, s.chw = utf8.DecodeRune(s.buf[s.r:s.e])
s.r += s.chw
// utf8.DecodeRune returns (utf8.RuneError, 1) for an invalid encoding;
// a correctly encoded U+FFFD has a width greater than 1 and is accepted
if s.ch == utf8.RuneError && s.chw == 1 {
s.error("invalid UTF-8 encoding")
goto redo
}
// BOM's are only allowed as the first character in a file
const BOM = 0xfeff
if s.ch == BOM {
if s.line > 0 || s.col > 0 {
s.error("invalid BOM in the middle of the file")
}
// the BOM itself is never returned to the caller
goto redo
}
}
// fill reads more source bytes into s.buf, growing the buffer or moving
// its content to the front as needed to make room. The unread bytes and,
// if there is one, the active segment are preserved; s.b, s.r, and s.e
// are adjusted accordingly. On return, either at least one more byte is
// in the buffer or s.ioerr != nil. The sentinel is (re-)written at
// s.buf[s.e] in either case.
func (s *source) fill() {
	// Everything from keep on must survive: the active segment if there
	// is one, otherwise just the unread bytes.
	keep := s.r
	if s.b >= 0 {
		keep = s.b
		s.b = 0 // the segment starts at index 0 once the content is at the front
	}
	content := s.buf[keep:s.e]

	switch {
	case len(content)*2 > len(s.buf):
		// preserved content fills more than half of the buffer: grow it
		s.buf = make([]byte, nextSize(len(s.buf)))
		copy(s.buf, content)
	case keep > 0:
		// enough room: just move the content to the front
		copy(s.buf, content)
	}
	s.r -= keep
	s.e -= keep

	// Try a limited number of times to read more data.
	for tries := 10; tries > 0; tries-- {
		// the last byte of the buffer is reserved for the sentinel
		n, err := s.in.Read(s.buf[s.e : len(s.buf)-1])
		s.ioerr = err
		if n < 0 {
			panic("negative read") // incorrect underlying io.Reader implementation
		}
		if n > 0 || err != nil {
			s.e += n
			s.buf[s.e] = sentinel
			return
		}
		// no bytes and no error: try again
	}

	// the reader kept returning (0, nil): give up
	s.buf[s.e] = sentinel
	s.ioerr = io.ErrNoProgress
}
// nextSize returns the next bigger size for a buffer of the given size:
// the minimum size for anything smaller than that, double the size up to
// and including the doubling limit, and the size plus the doubling limit
// beyond that.
func nextSize(size int) int {
	const (
		minSize = 4 << 10 // 4K: minimum buffer size
		maxSize = 1 << 20 // 1M: maximum buffer size which is still doubled
	)
	switch {
	case size < minSize:
		return minSize
	case size <= maxSize:
		return size * 2
	default:
		return size + maxSize
	}
}