# lexer.py: a minimal longest-match lexer for a toy expression language.
import re
from collections import namedtuple


class TokenDef:
    """One token definition: a name, a pattern (literal string or compiled
    regex), and an optional value_filter applied to the matched text."""

    def __init__(self, name, pattern, value_filter):
        self.name = name
        self.pattern = pattern
        self.value_filter = value_filter

    def __repr__(self):
        return 'TokenType.' + self.name
"""class TokenType:
def plus(self):
self.name = 'plus'
self.pattern = '+'
self.value_filter = None
def minus(self):
self.name = 'minus'
self.pattern = '-'
self.value_filter = None
def asterisk(self):
self.name = 'asterisk'
self.pattern = '*'
self.value_filter = None
def slash(self):
self.name = 'slash'
self.pattern = '/'
self.value_filter = None
# more punctuation
def left_paren(self):
self.name = 'left-paren'
self.pattern = '('
self.value_filter = None
def right_paren(self):
self.name = 'right_paren'
self.pattern = '+'
self.value_filter = None
#more tokens
def integer(self):
self.name = 'integer'
self.pattern = re.compile('[0-9]+')
self.value_filter = 'int'
def whitespace(self):
self.name = 'whitespace'
self.pattern = re.compile('[\t]+')
self.value_filter = None
"""
class TokenType:
    _defs = [
        # operators
        TokenDef('plus', '+', None),
        TokenDef('minus', '-', None),
        TokenDef('asterisk', '*', None),
        TokenDef('slash', '/', None),
        # other punctuation
        TokenDef('left_paren', '(', None),
        TokenDef('right_paren', ')', None),
        # more tokens
        TokenDef('integer', re.compile('[0-9]+'), int),
        TokenDef('whitespace', re.compile('[ \t]+'), None),  # spaces and tabs
        TokenDef('print', 'print', None),
    ]


# Make each definition reachable as an attribute (TokenType.plus, ...).
for def_ in TokenType._defs:
    setattr(TokenType, def_.name, def_)
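# A sketch of what the registration above provides (assumes the _defs order):
#   TokenType.plus is TokenType._defs[0]      # repr() gives 'TokenType.plus'
#   TokenType.integer.value_filter is int     # digit runs become int values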
# A lexed token: its TokenDef, its (possibly filtered) value, and the slice
# of the source text it covers.
Token = namedtuple('Token', ('type', 'value', 'slice'))
def first_token(text, start=0):
    """Return the longest token that starts at text[start], or None if no
    definition matches there."""
    match_text = text[start:]
    token = None
    token_text = None
    for tok in TokenType._defs:
        pattern = tok.pattern
        if pattern is None:
            continue
        elif isinstance(pattern, str):
            # Literal pattern: must appear verbatim at this position.
            if not match_text.startswith(pattern):
                continue
            match_value = pattern
        else:
            # Regex pattern: re.match anchors it at this position.
            match = pattern.match(match_text)
            if not match:
                continue
            match_value = match.group(0)
        # Keep only the longest match; e.g. 'print' wins over any shorter
        # token that also matches here.
        if token_text is not None and len(token_text) >= len(match_value):
            continue
        token_text = match_value
        value = match_value
        if tok.value_filter is not None:
            value = tok.value_filter(match_value)
        token = Token(tok, value, slice(start, start + len(token_text)))
    return token
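# A few illustrative calls (values assume the definitions above):
#   first_token('print(1)')    -> Token(type=TokenType.print, value='print', slice=slice(0, 5))
#   first_token('print(1)', 5) -> Token(type=TokenType.left_paren, value='(', slice=slice(5, 6))
#   first_token('print(1)', 6) -> Token(type=TokenType.integer, value=1, slice=slice(6, 7))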
def lex_raw(text):
    """Yield every token in text, whitespace included."""
    start = 0
    while start < len(text):
        token = first_token(text, start)
        if token is None:
            raise ValueError('unexpected character %r at index %d'
                             % (text[start], start))
        yield token
        start = token.slice.stop
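# lex_raw keeps whitespace, which matters when a later stage needs exact
# source positions; for example, lex_raw('1 + 2') yields integer, whitespace,
# plus, whitespace, integer in that order.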
def lex_skip_whitespace(text):
    """Like lex_raw, but drop whitespace tokens."""
    for token in lex_raw(text):
        if token.type is TokenType.whitespace:
            continue
        yield token


lex = lex_skip_whitespace
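# lex is the typical entry point. For example:
#   [t.value for t in lex('1 + 2')]  ->  [1, '+', 2]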
if __name__ == '__main__':
    for token in lex('print(68+778+(8*3)/25-19)'):
        print(token)
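# Expected output, one Token per line, starting with:
#   Token(type=TokenType.print, value='print', slice=slice(0, 5))
#   Token(type=TokenType.left_paren, value='(', slice=slice(5, 6))
#   Token(type=TokenType.integer, value=68, slice=slice(6, 8))
#   ...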