/
parser.rb
281 lines (242 loc) · 6.7 KB
/
parser.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
require File.dirname(__FILE__) + '/lexer'
require File.dirname(__FILE__) + '/expr'
class ParseError < Exception; end
# Recursive-descent parser.
#
# Since Scheme code is also data, this parser mimics the (read) procedure
# and reads source code into Scheme expressions (internal data
# representation suitable for further analysis).
class SchemeParser
attr_accessor :lexer, :text, :cur_token
def initialize()
@lexer = SchemeLexer.new()
clear()
end
# Given a string with Scheme source code, parses it into a list of
# expression objects.
def parse(text)
@text = text
@lexer.input(@text)
next_token
parse_file
end
def clear()
@text = ''
@cur_token = nil
end
# Convert a lexing position (offset from start of text) into a
# coordinate [line %s, column %s].
def pos2coord(pos)
# Count the amount of newlines between the beginning of the parsed
# text and pos. Then, count the column as an offset from the last
# newline
#
num_newlines = @text[0..pos].scan(%r{\n}).size
line_offset = @text[0..pos].rindex("\n")
line_offset = 0 if line_offset < 0
"[line #{num_newlines + 1}, column #{pos - line_offset}]"
end
private
def parse_error(msg, token=nil)
token = token or cur_token
if token
coord = pos2coord(token.pos)
raise ParseError.new("#{msg} #{coord}")
else
raise ParseError.new(msg)
end
end
def next_token
while true
@cur_token = @lexer.token()
if @cur_token.nil? or @cur_token._type != 'COMMENT'
break
end
end
rescue LexerError => e
raise ParseError.new("syntax error at #{pos2coord(lexerr.pos)}")
end
# The 'match' primitive of RD parsers.
# * Verifies that the current token is of the given type
# * Returns the value of the current token
# * Reads in the next token
def match(_type)
if @cur_token._type == _type
val = @cur_token.value
next_token()
return val
else
parse_error("Unmatched #{_type} (found #{@cur_token.type})")
end
end
##
## Recursive parsing rules. The top-level is _parse_file, which expects
## a sequence of Scheme expressions. The rest of the rules follow section
## 7.1.2 of R5RS with some re-ordering for programming convenience.
##
def parse_file()
datum_list = []
while @cur_token
datum_list << datum()
end
datum_list
end
def datum()
# list
if @cur_token._type == 'LPAREN'
return list()
# abbreviation
elsif @cur_token._type == 'QUOTE'
return abbreviation()
# simple datum
else
return simple_datum()
end
end
def simple_datum()
retval = nil
if @cur_token._type == 'BOOLEAN'
retval = Boolean.new(@cur_token.value == '#t')
elsif @cur_token._type == 'NUMBER'
base = 10
num_str = @cur_token.value.to_s
if num_str[0] == '#'
if num_str[1] == 'x'
base = 16
elsif num_str[1] == 'o'
base = 8
elsif num_str[1] == 'b'
base = 2
end
num_str = num_str[2..-1]
end
begin
retval = Number.new(num_str.to_i(base))
rescue ValueError => e
parse_error('Invalid number')
end
elsif @cur_token._type == 'ID'
retval = Sym.new(@cur_token.value)
elsif @cur_token._type == 'STRING'
retval = Sstring.new(@cur_token.value)
else
parse_error("Unexpected token '#{@cur_token.value}'")
end
next_token()
retval
end
def list()
# Algorithm:
#
# 1. First parse all sub-datums into a sequential Python list.
# 2. Convert this list into nested Pair objects
#
# To handle the dot ('.'), dot_idx keeps track of the index in lst
# where the dot was specified.
#
match('LPAREN')
lst = []
dot_idx = -1
while true
if not @cur_token
parse_error('Unmatched parentheses at end of input')
elsif @cur_token._type == 'RPAREN'
break
elsif @cur_token._type == 'ID' and @cur_token.value == '.'
if dot_idx > 0
parse_error('Invalid usage of "."')
end
dot_idx = lst.size
match('ID')
else
lst << datum()
end
end
# Figure out whether we have a dotted list and whether the dot was
# placed correctly
#
dotted_end = false
if dot_idx > 0
if dot_idx == lst.size - 1
dotted_end = true
else
parse_error('Invalid location for "." in list')
end
end
match('RPAREN')
cur_cdr = nil
if dotted_end
cur_cdr = lst[-1]
lst = lst[0..-2]
else
cur_cdr = nil
end
lst.reverse.each do |datum|
cur_cdr = Pair.new(datum, cur_cdr)
end
cur_cdr
end
def abbreviation()
quotepos = @cur_token.pos
match('QUOTE')
datum = datum()
Pair.new(Sym.new('quote'), Pair.new(datum, nil))
end
end
# Partial Scheme lexer based on R5RS 7.1.1 (Lexical structure).
class SchemeLexer < Lexer
def initialize
rules = lexing_rules()
super(rules, true)
end
def lexing_rules()
# Regex helpers
#
digit_2 = %r{[0-1]}
digit_8 = %r{[0-7]}
digit_10 = %r{[0-9]}
digit_16 = %r{[0-9A-Fa-f]}
radix_2 = %r{\#b}
radix_8 = %r{\#o}
radix_10 = %r{(\#d)?}
radix_16 = %r{\#x}
number = %r{(#{radix_2}#{digit_2}+|#{radix_8}#{digit_8}+|#{radix_10}#{digit_10}+|#{radix_16}#{digit_16}+)}
special_initial = '[!$%&*.:<=>?^_~]'
initial = '([a-zA-Z]|'+special_initial+')'
special_subsequent = '[+-.@]'
subsequent = "(#{initial}|#{digit_10}|#{special_subsequent})"
peculiar_identifier = '([+\-.]|\.\.\.)'
identifier = "(#{initial}#{subsequent}*|#{peculiar_identifier})"
special_initial = %r{#{special_initial}}
special_subsequent = %r{#{special_subsequent}}
peculiar_identifier = %r{#{peculiar_identifier}}
rules = [
[%r{;[^\n]*}, 'COMMENT'],
[%r{\#[tf]}, 'BOOLEAN'],
[number, 'NUMBER'],
[identifier, 'ID'],
[%r{\(}, 'LPAREN'],
[%r{\)}, 'RPAREN'],
[%r{\'}, 'QUOTE'],
[%r{\".*?\"}, 'STRING']
]
rules
end
end
#-------------------------------------------------------------------------------
if __FILE__ == $0
include Predicates
p = SchemeParser.new
res = p.parse %Q{
(define (double num)
(+ num num))
(write (double 12))
}
require 'pp'
p 'AST:'
pp res
puts
puts
puts "scheme repr:"
puts expr_repr(res)
end