token.go
package q

import (
	"fmt"
	"regexp"
)
// TokenKind identifies the class of a lexed token.
type TokenKind string

const (
	// Special
	TokenEOF = TokenKind("EOF")

	// Ignored
	TokenWhitespace = TokenKind("whitespace")

	// Words
	TokenAccessor = TokenKind("accessor")
	TokenWord     = TokenKind("word")
	TokenNumber   = TokenKind("number")
	TokenString   = TokenKind("string")

	// Operators
	TokenPipe         = TokenKind("|")
	TokenSemiColon    = TokenKind(";")
	TokenQuestionMark = TokenKind("?")
	TokenOpenBracket  = TokenKind("(")
	TokenCloseBracket = TokenKind(")")
	TokenOpenCurly    = TokenKind("{")
	TokenCloseCurly   = TokenKind("}")
	TokenColon        = TokenKind(":")
	TokenComma        = TokenKind(",")
	TokenEqual        = TokenKind("=")
	TokenNot          = TokenKind("!")
	TokenGreaterThan  = TokenKind(">")
	TokenLessThan     = TokenKind("<")
)
// TokenRegexp maps anchored regular expressions to the token kinds they
// produce. Entries are checked in order, so earlier patterns win when more
// than one matches.
var TokenRegexp = []struct {
	re   *regexp.Regexp
	kind TokenKind
}{
	{regexp.MustCompile(`^\s+$`), TokenWhitespace},
	{regexp.MustCompile(`^\|$`), TokenPipe},
	{regexp.MustCompile(`^;$`), TokenSemiColon},
	{regexp.MustCompile(`^\?$`), TokenQuestionMark},
	{regexp.MustCompile(`^\($`), TokenOpenBracket},
	{regexp.MustCompile(`^\)$`), TokenCloseBracket},
	{regexp.MustCompile(`^\{$`), TokenOpenCurly},
	{regexp.MustCompile(`^\}$`), TokenCloseCurly},
	{regexp.MustCompile(`^:$`), TokenColon},
	{regexp.MustCompile(`^,$`), TokenComma},
	{regexp.MustCompile(`^!$`), TokenNot},
	{regexp.MustCompile(`^=$`), TokenEqual},
	{regexp.MustCompile(`^>$`), TokenGreaterThan},
	{regexp.MustCompile(`^<$`), TokenLessThan},
	{regexp.MustCompile(`^".*"$`), TokenString},
	{regexp.MustCompile(`^\.[a-zA-Z0-9_]*$`), TokenAccessor},
	{regexp.MustCompile(`^[a-zA-Z_][a-zA-Z0-9_]*$`), TokenWord},
	{regexp.MustCompile(`^[0-9]+$`), TokenNumber},
}
// Token is a single lexeme: its kind and the raw text it was matched from.
type Token struct {
	Kind  TokenKind
	Value string
}
type Tokenizer struct{}

func NewTokenizer() *Tokenizer {
	return &Tokenizer{}
}
// TokenizeString splits s into a stream of tokens. Whitespace is matched but
// not emitted, and any input that never completes a match is silently
// dropped.
func (t *Tokenizer) TokenizeString(s string) *Tokens {
	tokens := []Token{}
	buf := []byte{}

Begin:
	for i := 0; i < len(s); i++ {
		buf = append(buf, s[i])

		// Try to match a token. At this point it may be possible to match
		// multiple tokens, which is why it's important that we check them in
		// order. The first match always wins.
		for _, test := range TokenRegexp {
			if test.re.Match(buf) {
				// Now attempt to consume as many characters as we can that
				// still match the regexp (maximal munch).
				for ; i+1 < len(s) && test.re.Match(append(buf, s[i+1])); i++ {
					buf = append(buf, s[i+1])
				}
				if test.kind != TokenWhitespace {
					token := Token{
						Kind:  test.kind,
						Value: string(buf),
					}
					tokens = append(tokens, token)
				}
				buf = nil
				continue Begin
			}
		}
	}

	return &Tokens{tokens, 0}
}
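
// A minimal usage sketch (illustrative only; the query below is an assumption
// about the language, not taken from this file):
//
//	toks := NewTokenizer().TokenizeString(`users | name = "bob"`)
//	pair, err := toks.Consume(TokenWord, TokenPipe)
//	// pair[0].Value == "users"; Position now points at the "name" token.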
// Tokens is a token stream with a cursor that Consume moves forward.
type Tokens struct {
	Tokens   []Token
	Position int
}
// Consume attempts to consume the expected token kinds in order. On success
// it returns the matched tokens and advances Position past them. If any
// expectation fails, Position is restored to its starting value and an error
// is returned. Reading past the end of the stream yields TokenEOF.
func (t *Tokens) Consume(expected ...TokenKind) (tokens []Token, err error) {
	originalPosition := t.Position
	for _, kind := range expected {
		p := Token{Kind: TokenEOF}
		if t.Position < len(t.Tokens) {
			p = t.Tokens[t.Position]
		}
		if p.Kind != kind {
			// Restore the cursor so a failed match has no side effects.
			t.Position = originalPosition
			return nil, fmt.Errorf("expected %s but found %s", kind, p.Kind)
		}
		tokens = append(tokens, p)
		t.Position++
	}
	return
}
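
// For example (illustrative), a failure partway through the expected kinds
// leaves the cursor untouched:
//
//	toks := NewTokenizer().TokenizeString(`a = 1`)
//	_, err := toks.Consume(TokenWord, TokenNumber) // "=" is not a number
//	// err != nil && toks.Position == 0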
// Rollback restores Position to the given value if *err is non-nil. It is
// designed to be deferred at the start of a parsing function.
func (t *Tokens) Rollback(position int, err *error) {
	if *err != nil {
		t.Position = position
	}
}
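
// Rollback pairs naturally with defer: t.Position is evaluated when the defer
// statement runs, so it captures the starting position, and the deferred call
// undoes any partial consumption if the function returns an error. A sketch,
// assuming a hypothetical caller parseComparison that is not part of this
// file:
//
//	func parseComparison(t *Tokens) (err error) {
//		defer t.Rollback(t.Position, &err)
//		if _, err = t.Consume(TokenWord, TokenEqual, TokenNumber); err != nil {
//			return err
//		}
//		return nil
//	}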