internal/lvgen/lvlexer.go

// Copyright 2017 The go-libvirt Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package lvgen

import (
	"fmt"
	"io"
	"io/ioutil"
	"strings"
	"unicode"
	"unicode/utf8"
)

// eof is the sentinel that next returns once the whole input has been
// consumed. It is negative, so it cannot be mistaken for a rune decoded from
// the input.
const eof = -1

// oneRuneTokens is the set of single-character tokens. When the lexer meets
// one of these runes it hands it to the parser as a token whose type is the
// integer value of the rune itself.
var oneRuneTokens = "{}[]<>(),=;:*"

var keywords = map[string]int{
	"hyper":    HYPER,
	"int":      INT,
	"short":    SHORT,
	"char":     CHAR,
	"bool":     BOOL,
	"case":     CASE,
	"const":    CONST,
	"default":  DEFAULT,
	"double":   DOUBLE,
	"enum":     ENUM,
	"float":    FLOAT,
	"opaque":   OPAQUE,
	"string":   STRING,
	"struct":   STRUCT,
	"switch":   SWITCH,
	"typedef":  TYPEDEF,
	"union":    UNION,
	"unsigned": UNSIGNED,
	"void":     VOID,
	"program":  PROGRAM,
	"version":  VERSION,
}

// item is a single lexeme: the unit the lexer hands to the parser.
type item struct {
	typ    int    // token type, as returned to the parser by Lex.
	val    string // text of the token, or the message for an error item.
	line   int    // lexer line counter at the moment the item was produced.
	column int    // lexer column counter at the moment the item was produced.
}

// String formats the item for debugging as: token name, quoted value, then
// line:column.
//
// goyacc gives single-rune tokens the value of the rune itself and allocates
// every named token a value at or above yyPrivate (see the generated y.go).
// yyTokname expects the compacted numbering, so a type in the named range is
// first translated through yyTok2; any other value is passed on unchanged.
func (i item) String() string {
	tok := i.typ
	if off := tok - yyPrivate; off >= 0 && off < len(yyTok2) {
		tok = yyTok2[off]
	}
	return fmt.Sprintf("%s %q %d:%d", yyTokname(tok), i.val, i.line, i.column)
}

// Lexer stores the state of this lexer. The whole input is held in memory as a
// string; start and pos delimit the text of the item currently being scanned,
// and finished items are delivered over the items channel.
type Lexer struct {
	input    string    // the string we're scanning.
	start    int       // byte offset in input where the current item starts.
	pos      int       // byte offset in input of the next rune to be read.
	line     int       // the current line, counted from 0 (for error reporting).
	column   int       // number of runes consumed so far on the current line.
	width    int       // width in bytes of the last rune scanned; 0 at eof or after backup.
	items    chan item // channel of scanned lexer items (lexemes).
	lastItem item      // the last item the lexer handed the parser.
}

// NewLexer reads everything rdr has to offer and returns a lexer positioned at
// the beginning of that text. If reading fails, the read error is returned
// unchanged together with a nil lexer. The returned lexer does nothing until
// Run is called.
func NewLexer(rdr io.Reader) (*Lexer, error) {
	data, err := ioutil.ReadAll(rdr)
	if err != nil {
		return nil, err
	}

	return &Lexer{
		input: string(data),
		items: make(chan item), // unbuffered: each emit waits for the parser
	}, nil
}

// Run drives the state machine. It begins in lexText and keeps invoking
// whatever state the previous one returned until a state returns nil, then
// closes the items channel. Because sending an item waits for the parser to
// receive it, Run should be called in its own goroutine.
func (l *Lexer) Run() {
	var state stateFn = lexText
	for state != nil {
		state = state(l)
	}
	close(l.items)
}

// emit sends the pending text, input[start:pos], to the parser as a token of
// type t, stamped with the lexer's current line and column. Afterwards the
// pending text is empty: the next item starts at pos.
func (l *Lexer) emit(t int) {
	tok := item{
		typ:    t,
		val:    l.input[l.start:l.pos],
		line:   l.line,
		column: l.column,
	}
	l.items <- tok
	l.start = l.pos
}

// Lex is the parser's entry point for fetching tokens. It waits for the next
// item from the lexer, remembers it in lastItem so Error can report where
// parsing stopped, copies its text into st and returns its token type. Once
// Run has closed the items channel the receive yields a zero item, so Lex
// returns 0 with an empty value.
func (l *Lexer) Lex(st *yySymType) int {
	l.lastItem = <-l.items
	st.val = l.lastItem.val
	return l.lastItem.typ
}

// Error is called by the parser when it finds a problem. It prints the message
// s to standard output along with the position and text of the last item that
// was handed to the parser. The lexer counts lines and columns from zero, so
// both are shown one higher.
func (l *Lexer) Error(s string) {
	last := l.lastItem
	fmt.Printf("parse error at %d:%d: %v\n", last.line+1, last.column+1, s)
	fmt.Printf("error at %q\n", last.val)
}

// errorf reports a lexing failure. The formatted message is sent to the parser
// as the value of an ERROR item carrying the current line and column, and the
// nil state is returned so that a state function can simply
// `return l.errorf(...)` to halt the state machine.
func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
	msg := fmt.Sprintf(format, args...)
	l.items <- item{typ: ERROR, val: msg, line: l.line, column: l.column}
	return nil
}

// next decodes and returns the rune at pos and moves pos past it, recording
// the rune's byte width so that backup can undo the step. The column counter
// advances by one for every rune except a newline, which instead starts a new
// line at column 0. When no input is left it returns eof, sets width to 0 and
// leaves the position untouched.
func (l *Lexer) next() (r rune) {
	if len(l.input) <= l.pos {
		l.width = 0
		return eof
	}

	var size int
	r, size = utf8.DecodeRuneInString(l.input[l.pos:])
	l.width = size
	l.pos += size

	if r == '\n' {
		l.line++
		l.column = 0
	} else {
		l.column++
	}
	return r
}

// ignore discards the pending text from start to pos by moving start up to
// pos, so that the skipped input is not included in the next item emitted.
// Line and column counters are not affected.
func (l *Lexer) ignore() {
	l.start = l.pos
}

// backup un-reads the rune returned by the most recent call to next, putting
// pos, line and column back to the values they had before that call. Only one
// rune can be undone: if there is nothing to undo (next reported eof, or
// backup has already run since the last next) the call does nothing.
func (l *Lexer) backup() {
	if l.width == 0 {
		// next consumed nothing, so there are no counters to rewind.
		// Adjusting them regardless would make positions drift at the end
		// of the input, where peek and accept back up after seeing eof.
		return
	}
	l.pos -= l.width
	l.width = 0

	if l.input[l.pos] != '\n' {
		l.column--
		return
	}

	// We stepped back over a line break. next reset the column counter when
	// it consumed the newline, so return to the previous line and recount
	// the runes that precede the break on it.
	l.line--
	lineStart := strings.LastIndexByte(l.input[:l.pos], '\n') + 1
	l.column = utf8.RuneCountInString(l.input[lineStart:l.pos])
}

// peek reports the rune that the next call to next will return, or eof if the
// input is exhausted. It works by reading the rune and immediately backing up
// again.
func (l *Lexer) peek() rune {
	defer l.backup()
	return l.next()
}

// accept consumes the next rune if it is one of the runes in valid and reports
// whether it did so. When the rune is not in valid (or the input has ended) it
// is put back and accept returns false.
func (l *Lexer) accept(valid string) bool {
	r := l.next()
	if !strings.ContainsRune(valid, r) {
		l.backup()
		return false
	}
	return true
}

// acceptRun consumes runes for as long as they belong to valid. The first rune
// that does not belong (or the end of the input) stops the run and is put
// back, so it is still available to the caller.
func (l *Lexer) acceptRun(valid string) {
	for {
		if r := l.next(); !strings.ContainsRune(valid, r) {
			break
		}
	}
	l.backup()
}

// keyword classifies the pending text input[start:pos]. If that text is one of
// the reserved words in the keywords table, the matching token id is
// returned; anything else is an IDENTIFIER. The lookup is case-sensitive.
func (l *Lexer) keyword() int {
	if tok, found := keywords[l.input[l.start:l.pos]]; found {
		return tok
	}
	return IDENTIFIER
}

// oneRuneToken reports whether r is one of the single-rune tokens listed in
// oneRuneTokens. For such a rune it returns the token id, which is simply the
// rune's integer value, and true; for any other rune it returns 0 and false.
func (l *Lexer) oneRuneToken(r rune) (int, bool) {
	if !strings.ContainsRune(oneRuneTokens, r) {
		return 0, false
	}
	return int(r), true
}

// State functions

// stateFn is one state of the lexer. A state scans some of the input held by
// the Lexer it is given and returns the state that should run next; returning
// nil stops the state machine.
type stateFn func(*Lexer) stateFn

// lexText is the master lex routine. The lexer is started in this state, and
// every other state returns here once it has dealt with its own construct.
//
// It looks at the upcoming input and decides what to do with it:
//
//   - "/*" switches to lexBlockComment.
//   - White space is discarded.
//   - '%' as the first rune of a line switches to lexDirective.
//   - A letter switches to lexIdent.
//   - An ASCII digit or '-' switches to lexNumber. Only ASCII digits qualify
//     because those are the only digits lexNumber is able to consume.
//   - A rune listed in oneRuneTokens is emitted as a token straight away.
//   - Any other rune is discarded, so that it cannot end up at the front of
//     the text of the next token.
//
// It returns nil, stopping the lexer, when the input is exhausted.
func lexText(l *Lexer) stateFn {
	for {
		if strings.HasPrefix(l.input[l.pos:], "/*") {
			return lexBlockComment
		}

		r := l.next()
		switch {
		case r == eof:
			return nil
		case unicode.IsSpace(r):
			l.ignore()
			return lexText
		case r == '%' && l.column == 1:
			// column is 1 only when r was the first rune on its line.
			l.backup()
			return lexDirective
		case unicode.IsLetter(r):
			l.backup()
			return lexIdent
		case r == '-' || ('0' <= r && r <= '9'):
			l.backup()
			return lexNumber
		}

		if t, ok := l.oneRuneToken(r); ok {
			l.emit(t)
			continue
		}

		// Not something this lexer knows about. Drop it from the pending
		// text; otherwise it would be carried along and become part of the
		// value of whatever token is emitted next.
		l.ignore()
	}
}

// lexBlockComment is used when we find a comment marker '/*' in the input. It
// consumes everything up to and including the closing '*/'.
//
// A comment that opens with '/**' is a metadata comment and is emitted whole,
// delimiters included, as a METADATACOMMENT item; any other comment is
// discarded. Reaching the end of the input before '*/' produces an
// "unterminated block comment" error and stops the lexer.
func lexBlockComment(l *Lexer) stateFn {
	opening := l.input[l.pos:]
	// Double star is used only at the start of metadata comments
	metadataComment := strings.HasPrefix(opening, "/**")

	// Step over the opening marker before searching for the end. Without
	// this, the '*' of the opener could pair with a following '/' and "/*/"
	// would be taken for a complete comment.
	if strings.HasPrefix(opening, "/*") {
		l.next()
		l.next()
	}

	for {
		if strings.HasPrefix(l.input[l.pos:], "*/") {
			// Found the end. Advance past the '*/' and discard the comment body
			// unless it's a metadata comment
			l.next()
			l.next()
			if metadataComment {
				l.emit(METADATACOMMENT)
			} else {
				l.ignore()
			}
			return lexText
		}
		if l.next() == eof {
			return l.errorf("unterminated block comment")
		}
	}
}

// lexIdent handles identifiers. It consumes a run of letters, digits and
// underscores, then emits the run either as the keyword it spells or, failing
// that, as an IDENTIFIER. The rune that ended the run is left in the input.
func lexIdent(l *Lexer) stateFn {
	r := l.next()
	for r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r) {
		r = l.next()
	}
	// r is not part of the identifier; give it back.
	l.backup()

	// The text may spell a keyword, so classify it before emitting.
	l.emit(l.keyword())

	return lexText
}

// lexNumber handles decimal and hexadecimal numbers. Decimal numbers may begin
// with a '-'; hex numbers begin with '0x' and do not accept leading '-'.
//
// A valid number is emitted as a CONSTANT item. The input is rejected with an
// "invalid number" error, which stops the lexer, when
//
//   - no digit at all is present (a lone '-', a bare '0x', or a rune that is
//     not an ASCII digit), or
//   - the digits are immediately followed by a letter.
//
// Requiring at least one digit also guarantees that this state always
// consumes input or stops; emitting an empty CONSTANT and returning to lexText
// would have the same rune routed back here indefinitely.
func lexNumber(l *Lexer) stateFn {
	digits := "0123456789"
	sawDigit := false

	// Leading '-' is ok
	if !l.accept("-") {
		// allow '0x' for hex numbers, as long as there's not a leading '-'.
		if l.accept("0") {
			sawDigit = true
			if l.accept("x") {
				digits = "0123456789ABCDEFabcdef"
				// The '0' was part of the prefix, not a digit of the value.
				sawDigit = false
			}
		}
	}

	// followed by any number of digits
	mark := l.pos
	l.acceptRun(digits)
	if l.pos > mark {
		sawDigit = true
	}

	if r := l.peek(); !sawDigit || unicode.IsLetter(r) {
		// Pull the offending rune into the pending text so that it shows up
		// in the error message.
		l.next()
		return l.errorf("invalid number: %q", l.input[l.start:l.pos])
	}
	l.emit(CONSTANT)
	return lexText
}

// lexDirective handles lines beginning with '%'. These are used to emit C code
// directly to the output file. For now the whole line, including its
// terminating newline, is simply discarded; some of the constants in the
// protocol file do depend on values from #included header files, so that may
// need to change. A directive that runs into the end of the input without a
// newline is reported as an "unterminated directive" error.
func lexDirective(l *Lexer) stateFn {
	for {
		switch l.next() {
		case '\n':
			l.ignore()
			return lexText
		case eof:
			return l.errorf("unterminated directive")
		}
	}
}