lexer/lexer.go

// Package lexer provides a handlebars tokenizer.
package lexer

import (
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// References:
//   - https://github.com/wycats/handlebars.js/blob/master/src/handlebars.l
//   - https://github.com/golang/go/blob/master/src/text/template/parse/lex.go

const (
	// Mustache delimiters, matched as literal prefixes of the remaining input.

	// two backslashes followed by an opening mustache: the escape is itself escaped
	escapedEscapedOpenMustache  = "\\\\{{"
	// one backslash followed by an opening mustache: the mustache is plain content
	escapedOpenMustache         = "\\{{"
	// opening mustache
	openMustache                = "{{"
	// closing mustache
	closeMustache               = "}}"
	// closing mustache preceded by a "~"
	closeStripMustache          = "~}}"
	// closing of an unescaped mustache with a "~" before the last two braces
	closeUnescapedStripMustache = "}~}}"
)

// eof is the sentinel value returned by Lexer.next() when there is no input left to scan.
const eof = -1

// lexFunc is a state of the lexer: it scans some input on given lexer and
// returns the state to run next. A nil result stops the lexer.
type lexFunc func(*Lexer) lexFunc

// Lexer is a lexical analyzer.
//
// Scanning runs in its own goroutine (started by scanWithName) and scanned
// tokens are handed over through the tokens channel, to be read with NextToken().
type Lexer struct {
	input    string     // input to scan
	name     string     // lexer name, used for testing purpose
	tokens   chan Token // channel of scanned tokens (unbuffered)
	nextFunc lexFunc    // the next state function to execute

	pos   int // current byte position in input string
	line  int // current line in input string, starting at 1
	width int // size in bytes of last rune scanned from input string (0 at eof)
	start int // start byte position of the token we are scanning

	// the shameful contextual properties needed because `nextFunc` is not enough
	closeComment *regexp.Regexp // regexp to scan close of current comment
	rawBlock     bool           // are we parsing a raw block content ?
}

var (
	// character class that must follow a lone "." for it to be scanned as an ID (see rDotID)
	lookheadChars        = `[\s` + regexp.QuoteMeta("=~}/)|") + `]`
	// character class that must follow a `true` or `false` literal (see rTrue and rFalse)
	literalLookheadChars = `[\s` + regexp.QuoteMeta("~})") + `]`

	// characters not allowed in an identifier
	//
	// NOTE(review): '\r' is not listed, so rID accepts a carriage return as part
	// of an identifier - confirm this is intended for CRLF templates
	unallowedIDChars = " \n\t!\"#%&'()*+,./;<=>@[\\]^`{|}~"

	// regular expressions
	//
	// All of them are anchored at the current scanning position, except
	// rOpenEndRawLookAhead which is used to search forward.

	// longest run of characters allowed in an identifier
	rID                  = regexp.MustCompile(`^[^` + regexp.QuoteMeta(unallowedIDChars) + `]+`)
	// "." followed by a lookahead character
	rDotID               = regexp.MustCompile(`^\.` + lookheadChars)
	// `true` and `false` literals, followed by a literal lookahead character
	rTrue                = regexp.MustCompile(`^true` + literalLookheadChars)
	rFalse               = regexp.MustCompile(`^false` + literalLookheadChars)
	// {{{{ and }}}}
	rOpenRaw             = regexp.MustCompile(`^\{\{\{\{`)
	rCloseRaw            = regexp.MustCompile(`^\}\}\}\}`)
	// {{{{/ at current position
	rOpenEndRaw          = regexp.MustCompile(`^\{\{\{\{/`)
	// {{{{/ anywhere in the remaining input
	rOpenEndRawLookAhead = regexp.MustCompile(`\{\{\{\{/`)
	// {{{ and }}}, with an optional ~ before the innermost brace
	rOpenUnescaped       = regexp.MustCompile(`^\{\{~?\{`)
	rCloseUnescaped      = regexp.MustCompile(`^\}~?\}\}`)
	// {{# , {{/ and {{> with an optional ~ after the opening braces
	rOpenBlock           = regexp.MustCompile(`^\{\{~?#`)
	rOpenEndBlock        = regexp.MustCompile(`^\{\{~?/`)
	rOpenPartial         = regexp.MustCompile(`^\{\{~?>`)
	// {{^}} or {{else}}
	rInverse          = regexp.MustCompile(`^(\{\{~?\^\s*~?\}\}|\{\{~?\s*else\s*~?\}\})`)
	// {{^ that is not immediately closed
	rOpenInverse      = regexp.MustCompile(`^\{\{~?\^`)
	// {{else that is not immediately closed
	rOpenInverseChain = regexp.MustCompile(`^\{\{~?\s*else`)
	// {{ or {{&
	rOpen            = regexp.MustCompile(`^\{\{~?&?`)
	// }} or ~}}
	rClose           = regexp.MustCompile(`^~?\}\}`)
	// "as |" that opens block params
	rOpenBlockParams = regexp.MustCompile(`^as\s+\|`)
	// {{!--  ... --}}
	rOpenCommentDash  = regexp.MustCompile(`^\{\{~?!--\s*`)
	rCloseCommentDash = regexp.MustCompile(`^\s*--~?\}\}`)
	// {{! ... }}
	rOpenComment  = regexp.MustCompile(`^\{\{~?!\s*`)
	rCloseComment = regexp.MustCompile(`^\s*~?\}\}`)
)

// Scan starts the lexical analysis of given input and returns the lexer in charge of it.
//
// Tokens are then pulled one at a time by calling NextToken() on the returned lexer.
func Scan(input string) *Lexer {
	// lexers created through the public entry point are anonymous
	const anonymous = ""

	return scanWithName(input, anonymous)
}

// scanWithName starts the lexical analysis of given input on a lexer that carries
// given name (names are used for testing).
//
// Scanning happens in a dedicated goroutine; tokens are pulled one at a time by
// calling NextToken() on the returned lexer.
func scanWithName(input string, name string) *Lexer {
	lex := new(Lexer)
	lex.input = input
	lex.name = name
	lex.tokens = make(chan Token)
	lex.line = 1 // lines are counted from 1

	go lex.run()

	return lex
}

// Collect scans given input and returns every token up to and including the
// first EOF or error token.
//
// This is meant for debugging only. Use Scan() and lexer.NextToken() instead.
func Collect(input string) []Token {
	var collected []Token

	lex := Scan(input)
	for done := false; !done; {
		tok := lex.NextToken()
		collected = append(collected, tok)

		// EOF and error tokens are both terminal
		switch tok.Kind {
		case TokenEOF, TokenError:
			done = true
		}
	}

	return collected
}

// NextToken waits for the scanning goroutine to hand over a token and returns it.
func (l *Lexer) NextToken() Token {
	return <-l.tokens
}

// run performs the lexical analysis: starting with lexContent, it executes
// state functions one after the other until one of them returns nil.
//
// The tokens channel is closed once scanning is over. A NextToken() call made
// after the last token was consumed then gets a zero Token instead of
// blocking forever.
func (l *Lexer) run() {
	for l.nextFunc = lexContent; l.nextFunc != nil; {
		l.nextFunc = l.nextFunc(l)
	}

	// this goroutine is the only sender, so it is the one that closes the channel
	close(l.tokens)
}

// next consumes and returns the next character from input, or eof if there is nothing left to scan
//
// The byte size of the consumed character is remembered so that backup() can undo the read.
func (l *Lexer) next() rune {
	if len(l.input) <= l.pos {
		// nothing consumed: a following backup() must be a no-op
		l.width = 0
		return eof
	}

	char, size := utf8.DecodeRuneInString(l.input[l.pos:])
	l.width = size
	l.pos += size

	return char
}

// produce sends a token of given kind and value, located at the start position
// and line of the token being scanned, then gets ready to scan the next token
func (l *Lexer) produce(kind TokenKind, val string) {
	startPos, lineNo := l.start, l.line
	l.tokens <- Token{kind, val, startPos, lineNo}

	// line feeds inside the produced value move the line counter forward
	l.line = lineNo + strings.Count(val, "\n")

	// the next token starts where scanning currently stands
	l.start = l.pos
}

// emit produces a token of given kind whose value is the input scanned since the token start
func (l *Lexer) emit(kind TokenKind) {
	scanned := l.input[l.start:l.pos]

	l.produce(kind, scanned)
}

// emitContent emits a content token, unless nothing was scanned since the token start
func (l *Lexer) emitContent() {
	if l.pos <= l.start {
		// no pending content
		return
	}

	l.emit(TokenContent)
}

// emitString produces a string token from the input scanned since the token
// start, with every backslash-escaped delimiter replaced by the bare delimiter
func (l *Lexer) emitString(delimiter rune) {
	quote := string(delimiter)
	unescaped := strings.Replace(l.input[l.start:l.pos], "\\"+quote, quote, -1)

	l.produce(TokenString, unescaped)
}

// peek returns the next character in the input without consuming it
func (l *Lexer) peek() rune {
	// undo the read once the character has been fetched
	defer l.backup()

	return l.next()
}

// backup moves the scanning position back by the size of the last character read by next()
//
// WARNING: Can only be called once per call of next
func (l *Lexer) backup() {
	l.pos = l.pos - l.width
}

// ignore skips all characters that have been scanned up to current position
//
// Line feeds found in the skipped text still advance the line counter, so that
// tokens and errors produced afterwards report an accurate line (whitespace
// skipped inside an expression may span several lines).
func (l *Lexer) ignore() {
	if l.pos > l.start {
		l.line += strings.Count(l.input[l.start:l.pos], "\n")
	}

	l.start = l.pos
}

// accept consumes the next character only if it is part of given string, and
// reports whether it did
func (l *Lexer) accept(valid string) bool {
	if r := l.next(); strings.IndexRune(valid, r) < 0 {
		// not an expected character: put it back
		l.backup()
		return false
	}

	return true
}

// acceptRun consumes characters for as long as they are part of given string
func (l *Lexer) acceptRun(valid string) {
	for {
		if r := l.next(); strings.IndexRune(valid, r) < 0 {
			break
		}
	}

	// the character that ended the run does not belong to it
	l.backup()
}

// errorf sends an error token carrying the formatted message and returns a
// nil state, which stops the lexer
func (l *Lexer) errorf(format string, args ...interface{}) lexFunc {
	message := fmt.Sprintf(format, args...)
	l.tokens <- Token{TokenError, message, l.start, l.line}

	return nil
}

// isString reports whether the input at current scanning position starts with given string
func (l *Lexer) isString(str string) bool {
	remaining := l.input[l.pos:]

	return strings.HasPrefix(remaining, str)
}

// findRegexp returns the first match of given regular expression in the input
// that follows current scanning position, or an empty string if there is none
func (l *Lexer) findRegexp(r *regexp.Regexp) string {
	remaining := l.input[l.pos:]

	return r.FindString(remaining)
}

// indexRegexp returns where the first match of given regular expression starts,
// as an offset from current scanning position
//
// It returns -1 if not found
func (l *Lexer) indexRegexp(r *regexp.Regexp) int {
	if loc := r.FindStringIndex(l.input[l.pos:]); loc != nil {
		return loc[0]
	}

	return -1
}

// lexContent scans content (ie: everything that is not between mustaches)
//
// Each call either detects something special at current position, in which
// case pending content is emitted and the matching state is returned, or
// consumes one character and keeps scanning content.
func lexContent(l *Lexer) lexFunc {
	var following lexFunc

	switch {
	case l.rawBlock:
		// inside a raw block, everything up to the next {{{{/ is content
		offset := l.indexRegexp(rOpenEndRawLookAhead)
		if offset == -1 {
			return l.errorf("Unclosed raw block")
		}

		l.rawBlock = false
		l.pos += offset

		following = lexOpenMustache

	case l.isString(escapedEscapedOpenMustache):
		// \\{{

		// content keeps the first backslash only
		l.next()
		l.emitContent()

		// the second backslash is dropped
		l.next()
		l.ignore()

		following = lexContent

	case l.isString(escapedOpenMustache):
		// \{{
		following = lexEscapedOpenMustache

	case l.findRegexp(rOpenCommentDash) != "":
		// {{!--
		l.closeComment = rCloseCommentDash

		following = lexComment

	case l.findRegexp(rOpenComment) != "":
		// {{!
		l.closeComment = rCloseComment

		following = lexComment

	case l.isString(openMustache):
		// {{
		following = lexOpenMustache
	}

	if following == nil {
		// nothing special here: consume one character
		if l.next() != eof {
			return lexContent
		}

		// end of input: flush pending content and finish
		l.emitContent()
		l.emit(TokenEOF)

		return nil
	}

	// flush pending content before switching state
	l.emitContent()

	return following
}

// lexEscapedOpenMustache scans \{{ : the backslash is dropped and the opening
// braces become part of the content
func lexEscapedOpenMustache(l *Lexer) lexFunc {
	// drop the escape character
	l.next()
	l.ignore()

	// consume every opening brace that follows
	for {
		if l.peek() != '{' {
			break
		}

		l.next()
	}

	return lexContent
}

// lexOpenMustache scans an opening mustache: {{ and all its variants
//
// Variants are tried from the most specific to the most generic one. The
// first that matches at current position decides which token is emitted.
func lexOpenMustache(l *Lexer) lexFunc {
	var matched string

	// try remembers what given regexp matches at current position, and reports
	// whether it matched anything
	try := func(re *regexp.Regexp) bool {
		matched = l.findRegexp(re)
		return matched != ""
	}

	var kind TokenKind

	// an expression follows every opening, except a complete {{^}} or {{else}}
	following := lexFunc(lexExpression)

	switch {
	case try(rOpenEndRaw):
		kind = TokenOpenEndRawBlock
	case try(rOpenRaw):
		kind = TokenOpenRawBlock
		l.rawBlock = true
	case try(rOpenUnescaped):
		kind = TokenOpenUnescaped
	case try(rOpenBlock):
		kind = TokenOpenBlock
	case try(rOpenEndBlock):
		kind = TokenOpenEndBlock
	case try(rOpenPartial):
		kind = TokenOpenPartial
	case try(rInverse):
		kind = TokenInverse
		following = lexContent
	case try(rOpenInverse):
		kind = TokenOpenInverse
	case try(rOpenInverseChain):
		kind = TokenOpenInverseChain
	case try(rOpen):
		kind = TokenOpen
	default:
		// this is rotten
		panic("Current pos MUST be an opening mustache")
	}

	l.pos += len(matched)
	l.emit(kind)

	return following
}

// lexCloseMustache scans a closing mustache: }} and all its variants
//
// Variants are tried from the longest to the shortest one.
func lexCloseMustache(l *Lexer) lexFunc {
	var matched string

	// try remembers what given regexp matches at current position, and reports
	// whether it matched anything
	try := func(re *regexp.Regexp) bool {
		matched = l.findRegexp(re)
		return matched != ""
	}

	var kind TokenKind

	switch {
	case try(rCloseRaw):
		// }}}}
		kind = TokenCloseRawBlock
	case try(rCloseUnescaped):
		// }}}
		kind = TokenCloseUnescaped
	case try(rClose):
		// }}
		kind = TokenClose
	default:
		// this is rotten
		panic("Current pos MUST be a closing mustache")
	}

	l.pos += len(matched)
	l.emit(kind)

	return lexContent
}

// lexExpression scans inside mustaches
//
// It first checks for a closing delimiter, then for tokens that are recognized
// by looking ahead, and finally dispatches on the next character.
func lexExpression(l *Lexer) lexFunc {
	// a closing delimiter ends the expression
	for _, closer := range []string{closeMustache, closeStripMustache, closeUnescapedStripMustache} {
		if l.isString(closer) {
			return lexCloseMustache
		}
	}

	// tokens recognized before advancing scanning position
	var (
		size int
		kind TokenKind
	)

	if opener := l.findRegexp(rOpenBlockParams); opener != "" {
		// "as |"
		size, kind = len(opener), TokenOpenBlockParams
	} else if l.isString("..") {
		// ..
		size, kind = len(".."), TokenID
	} else if l.findRegexp(rDotID) != "" {
		// . (the lookahead character is not consumed)
		size, kind = len("."), TokenID
	} else if l.findRegexp(rTrue) != "" {
		// true
		size, kind = len("true"), TokenBoolean
	} else if l.findRegexp(rFalse) != "" {
		// false
		size, kind = len("false"), TokenBoolean
	}

	if size > 0 {
		l.pos += size
		l.emit(kind)

		return lexExpression
	}

	// dispatch on next character
	r := l.next()

	if r == eof {
		return l.errorf("Unclosed expression")
	}

	if isIgnorable(r) {
		return lexIgnorable
	}

	switch r {
	case '(':
		l.emit(TokenOpenSexpr)
	case ')':
		l.emit(TokenCloseSexpr)
	case '=':
		l.emit(TokenEquals)
	case '@':
		l.emit(TokenData)
	case '"', '\'':
		// the string state reads the delimiter itself
		l.backup()
		return lexString
	case '/', '.':
		l.emit(TokenSep)
	case '|':
		l.emit(TokenCloseBlockParams)
	case '+', '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		l.backup()
		return lexNumber
	case '[':
		return lexPathLiteral
	default:
		if strings.IndexRune(unallowedIDChars, r) >= 0 {
			return l.errorf("Unexpected character in expression: '%c'", r)
		}

		l.backup()
		return lexIdentifier
	}

	return lexExpression
}

// lexComment scans a comment opened by {{!-- or {{!
//
// Characters are consumed one at a time until the closing pattern stored in
// l.closeComment matches at current position.
func lexComment(l *Lexer) lexFunc {
	closing := l.findRegexp(l.closeComment)
	if closing == "" {
		// not closed yet: consume one more character
		if l.next() == eof {
			return l.errorf("Unclosed comment")
		}

		return lexComment
	}

	l.pos += len(closing)
	l.emit(TokenComment)

	return lexContent
}

// lexIgnorable consumes and discards all following ignorable characters
func lexIgnorable(l *Lexer) lexFunc {
	for {
		if !isIgnorable(l.peek()) {
			break
		}

		l.next()
	}

	l.ignore()

	return lexExpression
}

// lexString scans a quoted string
//
// The character at current position is taken as the delimiter. A delimiter
// that directly follows a backslash does not end the string. Delimiters are
// not part of the emitted token.
func lexString(l *Lexer) lexFunc {
	// opening delimiter, left out of the token
	quote := l.next()
	l.ignore()

	// did the previous character happen to be a backslash ?
	afterBackslash := false

	for {
		r := l.next()
		if r == eof || r == '\n' {
			return l.errorf("Unterminated string")
		}

		if r == quote && !afterBackslash {
			break
		}

		afterBackslash = r == '\\'
	}

	// closing delimiter is left out of the token
	l.backup()
	l.emitString(quote)

	// then skipped
	l.next()
	l.ignore()

	return lexExpression
}

// lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
// isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
// and "089" - but when it's wrong the input is invalid and the parser (via
// strconv) will notice.
//
// NOTE: borrowed from https://github.com/golang/go/tree/master/src/text/template/parse/lex.go
func lexNumber(l *Lexer) lexFunc {
	valid := l.scanNumber()

	if valid {
		if c := l.peek(); c == '+' || c == '-' {
			// Complex: 1+2i. No spaces, must end in 'i'.
			valid = l.scanNumber() && l.input[l.pos-1] == 'i'
		}
	}

	if !valid {
		return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
	}

	l.emit(TokenNumber)

	return lexExpression
}

// scanNumber consumes a number and reports whether it is properly terminated,
// ie. not directly followed by an alphanumeric character
//
// NOTE: borrowed from https://github.com/golang/go/tree/master/src/text/template/parse/lex.go
func (l *Lexer) scanNumber() bool {
	const decimal = "0123456789"

	// Optional leading sign.
	l.accept("+-")

	// A "0x" or "0X" prefix switches to hex digits.
	allowed := decimal
	if l.accept("0") {
		if l.accept("xX") {
			allowed = decimal + "abcdefABCDEF"
		}
	}

	// Integer part.
	l.acceptRun(allowed)

	// Fractional part.
	if l.accept(".") {
		l.acceptRun(allowed)
	}

	// Exponent.
	if l.accept("eE") {
		l.accept("+-")
		l.acceptRun(decimal)
	}

	// Is it imaginary?
	l.accept("i")

	// Next thing mustn't be alphanumeric.
	if !isAlphaNumeric(l.peek()) {
		return true
	}

	// consume the offending character
	l.next()

	return false
}

// lexIdentifier scans an ID: the longest run of characters allowed in an identifier
//
// It panics if there is no such character at current position.
func lexIdentifier(l *Lexer) lexFunc {
	id := l.findRegexp(rID)
	if id == "" {
		// this is rotten
		panic("Identifier expected")
	}

	l.pos += len(id)
	l.emit(TokenID)

	return lexExpression
}

// lexPathLiteral scans an [ID]
//
// Input is consumed up to and including the closing bracket, then everything
// scanned since the token start is emitted as an ID. The literal must be
// closed before the end of the line.
func lexPathLiteral(l *Lexer) lexFunc {
	for closed := false; !closed; {
		switch l.next() {
		case eof, '\n':
			return l.errorf("Unterminated path literal")
		case ']':
			closed = true
		}
	}

	l.emit(TokenID)

	return lexExpression
}

// isIgnorable returns true if given character is ignorable, ie. is a whitespace:
// space, tab, line feed, carriage return or form feed
//
// This is the same set as the `\s` class of regular expressions, so that
// templates with CRLF line endings get their carriage returns skipped rather
// than scanned as identifier characters.
func isIgnorable(r rune) bool {
	switch r {
	case ' ', '\t', '\n', '\r', '\f':
		return true
	}

	return false
}

// isAlphaNumeric reports whether r is an underscore, a unicode letter or a unicode digit.
//
// NOTE borrowed from https://github.com/golang/go/tree/master/src/text/template/parse/lex.go
func isAlphaNumeric(r rune) bool {
	switch {
	case r == '_':
		return true
	case unicode.IsLetter(r):
		return true
	default:
		return unicode.IsDigit(r)
	}
}