-
-
Notifications
You must be signed in to change notification settings - Fork 1.6k
/
lexer.cr
375 lines (330 loc) · 7.79 KB
/
lexer.cr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
require "string_pool"
abstract class JSON::Lexer
# Builds a lexer over JSON text that is already fully in memory.
#
# Delegates to `StringBased.new` with *string*.
def self.new(string : String)
  StringBased.new(string)
end

# Builds a lexer over JSON text that is read from *io*.
#
# Delegates to `IOBased.new` with *io*.
def self.new(io : IO)
  IOBased.new(io)
end
# The token most recently produced by `next_token`. The same instance is
# updated in place and returned on every call.
getter token : Token
# When true, `next_token` consumes string tokens with `consume_string_skip`
# instead of `consume_string`.
property skip : Bool
# Sets up the state shared by every lexer implementation.
def initialize
  # Position tracking: both counters start at 1.
  @line_number = 1
  @column_number = 1

  # Mode flags, both off until a caller switches them on.
  @skip = false
  @expects_object_key = false

  # Working objects that are reused across tokens.
  @token = Token.new
  @buffer = IO::Memory.new
  @string_pool = StringPool.new
end
# Consumes a string token whose contents are needed; `next_token` calls
# this for a `"` when `skip` is false.
private abstract def consume_string
# Advances to the next character without touching `@column_number`.
# `next_char` bumps the column and then returns this method's result.
private abstract def next_char_no_column_increment
# The character at the current position. `next_token` treats `'\0'` as
# the end of the input.
private abstract def current_char
# Called once at the start of `consume_number`, before any character of
# the number has been examined.
private abstract def number_start
# Called once for every character that belongs to the number being lexed.
private abstract def append_number_char
# The text of the number being lexed. It is stored as the token's
# `raw_value` and re-parsed for floats with 18 or more digits.
private abstract def number_string
# Reads the next token from the input, skipping leading whitespace first.
#
# The shared `@token` is stamped with the current line and column, filled
# in according to the current character, and returned. Any character that
# does not start a structural token, a literal or a string is handed to
# `consume_number`.
def next_token
  skip_whitespace

  @token.line_number = @line_number
  @token.column_number = @column_number

  case current_char
  when '\0' then @token.kind = :EOF
  when '{' then next_char :begin_object
  when '}' then next_char :end_object
  when '[' then next_char :begin_array
  when ']' then next_char :end_array
  when ',' then next_char :comma
  when ':' then next_char :colon
  when 'f' then consume_false
  when 'n' then consume_null
  when 't' then consume_true
  when '"'
    @token.kind = :string
    if @skip
      consume_string_skip
    else
      consume_string
    end
  else
    consume_number
  end

  @token
end
# Requests the next token where the parser expects a JSON object key.
# In this case the lexer tries to reuse `String` instances by looking
# the key up in a `StringPool`.
#
# Returns the same token that `next_token` returns. The object-key mode
# is switched off again in an `ensure` clause, so the flag does not stay
# set if `next_token` raises.
def next_token_expect_object_key
  @expects_object_key = true
  next_token
ensure
  @expects_object_key = false
end
# Advances past any run of whitespace characters, keeping the line and
# column counters up to date.
private def skip_whitespace
  loop do
    break unless whitespace?(current_char)

    if current_char == '\n'
      @line_number += 1
      # Reset to 0 so that the `next_char` below bumps the column to 1.
      @column_number = 0
    end

    next_char
  end
end
# Returns true when *char* is a space, tab, line feed or carriage return,
# and false for anything else.
private def whitespace?(char)
  char == ' ' ||
    char == '\t' ||
    char == '\n' ||
    char == '\r'
end
# Lexes the rest of a `true` literal; `next_token` calls this when the
# current character is 't'. Raises through `unexpected_char` when the
# following characters are not "rue".
private def consume_true
  matched = next_char == 'r' && next_char == 'u' && next_char == 'e'
  unexpected_char unless matched

  # Step past the final 'e'.
  next_char
  @token.kind = :true
end
# Lexes the rest of a `false` literal; `next_token` calls this when the
# current character is 'f'. Raises through `unexpected_char` when the
# following characters are not "alse".
private def consume_false
  matched = next_char == 'a' && next_char == 'l' && next_char == 's' && next_char == 'e'
  unexpected_char unless matched

  # Step past the final 'e'.
  next_char
  @token.kind = :false
end
# Lexes the rest of a `null` literal; `next_token` calls this when the
# current character is 'n'. Raises through `unexpected_char` when the
# following characters are not "ull".
private def consume_null
  matched = next_char == 'u' && next_char == 'l' && next_char == 'l'
  unexpected_char unless matched

  # Step past the final 'l'.
  next_char
  @token.kind = :null
end
# Moves past a string token without collecting its contents. Used when
# `skip` is set, because the caller does not care about the value.
#
# Escape sequences are still decoded (and thereby validated), and raw
# control characters below code point 32 are rejected.
private def consume_string_skip
  loop do
    case next_char
    when '\0'
      raise "Unterminated string"
    when '\\'
      consume_string_escape_sequence
    when '"'
      # Step past the closing quote and stop.
      next_char
      break
    else
      unexpected_char if (0...32).includes?(current_char.ord)
    end
  end
end
# Reads a string token into `@buffer` without running anything after the
# buffer is cleared.
private def consume_string_with_buffer
  consume_string_with_buffer { }
end

# Reads a string token into `@buffer` and stores the result in the
# token's `string_value`.
#
# The buffer is cleared first, then the given block is yielded to before
# any character is read. Escape sequences are decoded, and raw control
# characters below code point 32 are rejected.
private def consume_string_with_buffer
  @buffer.clear
  yield

  loop do
    char = next_char
    case char
    when '\0'
      raise "Unterminated string"
    when '\\'
      @buffer << consume_string_escape_sequence
    when '"'
      # Step past the closing quote and stop.
      next_char
      break
    else
      unexpected_char if (0...32).includes?(current_char.ord)
      @buffer << char
    end
  end

  # Object keys go through the pool so repeated keys share one String.
  @token.string_value =
    if @expects_object_key
      @string_pool.get(@buffer)
    else
      @buffer.to_s
    end
end
# Decodes the escape sequence that follows a backslash inside a string and
# returns the character it stands for.
#
# For `\uXXXX` escapes:
# * a value outside the surrogate range maps directly to that code point;
# * a high surrogate (0xD800..0xDBFF, inclusive) must be followed by a
#   second `\uXXXX` escape holding a low surrogate (0xDC00..0xDFFF); the
#   pair is combined into one code point above U+FFFF;
# * a low surrogate on its own, or a high surrogate followed by something
#   that is not a low surrogate, is rejected.
#
# Raises a parse error for an unknown escape character, for a `\u` escape
# with non-hexadecimal digits, and for broken surrogate pairs.
private def consume_string_escape_sequence
  case char = next_char
  when '\\', '"', '/'
    char
  when 'b' then '\b'
  when 'f' then '\f'
  when 'n' then '\n'
  when 'r' then '\r'
  when 't' then '\t'
  when 'u'
    hexnum1 = read_hex_number
    if hexnum1 < 0xD800 || hexnum1 > 0xDFFF
      hexnum1.chr
    elsif hexnum1 <= 0xDBFF
      # High surrogate: the low half must follow as another \u escape.
      if next_char != '\\' || next_char != 'u'
        raise "Unterminated UTF-16 sequence"
      end
      hexnum2 = read_hex_number
      unless 0xDC00 <= hexnum2 <= 0xDFFF
        raise "Invalid UTF-16 sequence"
      end
      # The 0x10000 offset has to be added, not OR-ed: the shifted high
      # bits can themselves occupy bit 16.
      (0x10000 + ((hexnum1 & 0x3FF) << 10) + (hexnum2 & 0x3FF)).chr
    else
      # A low surrogate that is not preceded by a high surrogate.
      raise "Invalid UTF-16 sequence"
    end
  else
    raise "Unknown escape char: #{char}"
  end
end
# Reads the next four characters as hexadecimal digits and returns the
# number they form. Raises a parse error on the first character that is
# not a hexadecimal digit.
private def read_hex_number
  value = 0
  4.times do
    char = next_char
    digit = char.to_i?(16)
    raise "Unexpected char in hex number: #{char.inspect}" unless digit
    value = (value << 4) | digit
  end
  value
end
# Lexes a number starting at the current character.
#
# Handles an optional leading '-', then either a single '0' or a run of
# digits starting with '1'..'9'. A following '.' hands over to
# `consume_float`, a following 'e'/'E' to `consume_exponent`; otherwise the
# token becomes an `:int`. A leading zero followed by another digit, or a
# character that cannot start a number, raises through `unexpected_char`.
#
# The integer value is accumulated with wrapping arithmetic, so integers
# that do not fit in an Int64 yield a wrapped `int_value`; the exact text
# is always available through the token's `raw_value`.
private def consume_number
  # TODO once overflow is the default the overflow custom logic can be refactored
  number_start
  integer = 0_i64
  negative = false
  digits = 0

  if current_char == '-'
    append_number_char
    negative = true
    next_char
  end

  case current_char
  when '0'
    append_number_char
    next_char
    case current_char
    when '.'
      consume_float(negative, integer, digits)
    when 'e', 'E'
      consume_exponent(negative, integer.to_f64, digits)
    when '0'..'9'
      # Leading zeros are not allowed.
      unexpected_char
    else
      @token.kind = :int
      @token.int_value = 0_i64
      number_end
    end
  when '1'..'9'
    digits = 1
    append_number_char
    integer = (current_char - '0').to_i64
    char = next_char
    while '0' <= char <= '9'
      append_number_char
      integer &*= 10
      integer &+= char - '0'
      digits += 1
      char = next_char
    end

    case char
    when '.'
      consume_float(negative, integer, digits)
    when 'e', 'E'
      consume_exponent(negative, integer.to_f64, digits)
    else
      @token.kind = :int
      # Negate with wrapping subtraction: for "-9223372036854775808" the
      # accumulator has already wrapped to Int64::MIN, and a checked unary
      # minus on that value would raise OverflowError.
      @token.int_value = negative ? (0_i64 &- integer) : integer
      number_end
    end
  else
    unexpected_char
  end
end
# Lexes the fractional part of a number; called when the current character
# is the '.' that follows the integer part.
#
# *negative* tells whether the number had a leading '-', *integer* is the
# value of the digits read so far and *digits* is how many there were.
# At least one digit must follow the '.'. An 'e'/'E' after the fraction
# hands over to `consume_exponent`; otherwise the token becomes a `:float`.
private def consume_float(negative, integer, digits)
  # TODO once overflow is the default the overflow custom logic can be refactored
  append_number_char
  scale = 1_u64

  char = next_char
  unexpected_char unless '0' <= char <= '9'

  # Keep extending the digit run as one big integer and remember the
  # power of ten needed to scale it back down.
  while '0' <= char <= '9'
    append_number_char
    integer = (integer &* 10) &+ (char - '0')
    scale = scale &* 10
    digits += 1
    char = next_char
  end

  float = integer.to_f64 / scale

  case char
  when 'e', 'E'
    consume_exponent(negative, float, digits)
  else
    @token.kind = :float
    @token.float_value =
      if digits >= 18
        # If there's a chance of overflow, we parse the raw string
        number_string.to_f64
      elsif negative
        -float
      else
        float
      end
    number_end
  end
end
# Lexes the exponent part of a number; called when the current character
# is the 'e' or 'E' that introduces it.
#
# *negative* tells whether the number had a leading '-', *float* is the
# value of the mantissa read so far and *digits* is how many mantissa
# digits there were. An optional sign may follow the 'e', and at least one
# digit is required. The token becomes a `:float`.
private def consume_exponent(negative, float, digits)
  # TODO once overflow is the default the overflow custom logic can be refactored
  append_number_char
  exponent = 0
  negative_exponent = false

  char = next_char
  if char == '+'
    append_number_char
    char = next_char
  elsif char == '-'
    append_number_char
    char = next_char
    negative_exponent = true
  end

  unexpected_char unless '0' <= char <= '9'

  while '0' <= char <= '9'
    append_number_char
    # Stop accumulating once the exponent is far beyond what a Float64 can
    # represent. 10 ** exponent is already infinity (or zero when negated)
    # at this point, so the result is unchanged, and an arbitrarily long
    # exponent can no longer overflow the Int32 accumulator.
    if exponent < 100_000
      exponent = exponent * 10 + (char - '0')
    end
    char = next_char
  end

  @token.kind = :float

  exponent = -exponent if negative_exponent
  float *= (10_f64 ** exponent)

  # If there's a chance of overflow, we parse the raw string
  if digits >= 18
    @token.float_value = number_string.to_f64
  else
    @token.float_value = negative ? -float : float
  end

  number_end
end
# Advances one character, counting it in `@column_number`, and returns
# the result of `next_char_no_column_increment`.
private def next_char
  @column_number += 1
  next_char_no_column_increment
end

# Marks the current token as *kind* and then advances one character.
private def next_char(kind : Token::Kind)
  @token.kind = kind
  next_char
end
# Finishes a number token by storing its text as the token's raw value.
private def number_end
  raw = number_string
  @token.raw_value = raw
end
# Raises a parse error reporting *char*, which defaults to the character
# at the current position.
private def unexpected_char(char = current_char)
  message = "Unexpected char '#{char}'"
  raise message
end
# Shadows the top-level `raise` inside the lexer: wraps *msg* in a
# `ParseException` carrying the lexer's current line and column, then
# raises it with the top-level `::raise`.
private def raise(msg)
  error = ParseException.new(msg, @line_number, @column_number)
  ::raise error
end
end
require "./lexer/*"