-
Notifications
You must be signed in to change notification settings - Fork 2
/
Applicative.hs
288 lines (258 loc) · 8.3 KB
/
Applicative.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
{-# LANGUAGE ScopedTypeVariables, DeriveDataTypeable, DeriveFunctor #-}
-- | For some background, see
-- <https://ro-che.info/articles/2015-01-02-lexical-analysis>
module Language.Lexer.Applicative
(
-- * Building a Lexer
Lexer(..)
, token
, whitespace
-- ** Building Recognizers
, Recognizer
, longest
, longestShortest
-- * Running a Lexer
, runLexer
-- ** Working with a token stream
, TokenStream(..)
, streamToList
, streamToEitherList
, LexicalError(..)
) where
import Text.Regex.Applicative
import Data.Loc
import Data.List
import Data.Typeable (Typeable)
import Data.Semigroup (Semigroup(..))
import Data.Function
import Control.Exception
----------------------------------------------------------------------
-- Lexer
----------------------------------------------------------------------
-- | A 'Lexer' specification consists of two recognizers: one for
-- meaningful tokens and one for whitespace and comments.
--
-- Although you can construct 'Lexer's directly, it is more convenient to
-- build them with 'token', 'whitespace', and the 'Monoid' instance like this:
--
-- @
-- myLexer :: 'Lexer' MyToken
-- myLexer = 'mconcat'
-- [ 'token' ('longest' myToken)
-- , 'whitespace' ('longest' myWhiteSpace)
-- , 'whitespace' ('longestShortest' myCommentPrefix myCommentSuffix)
-- ]
-- @
data Lexer tok = Lexer
  { lexerTokenRE      :: Recognizer tok -- ^ recognizer producing meaningful tokens
  , lexerWhitespaceRE :: Recognizer ()  -- ^ recognizer for input to discard (whitespace, comments)
  }
  deriving Functor

-- | Lexers combine componentwise: the token recognizers compete with
-- each other, and so do the whitespace recognizers.
instance Semigroup (Lexer tok) where
  Lexer t1 w1 <> Lexer t2 w2 = Lexer (t1 <> t2) (w1 <> w2)

-- | 'mempty' is the lexer that recognizes nothing at all.
instance Monoid (Lexer tok) where
  mempty = Lexer mempty mempty
  -- Defined via 'Semigroup' for compatibility with older GHCs where
  -- 'mappend' is still used directly.
  mappend = (<>)
-- | Build a lexer that produces tokens with the given recognizer and
-- skips nothing (its whitespace recognizer is 'mempty').
--
-- 'token' is a monoid homomorphism:
--
-- @'token' a '<>' 'token' b = 'token' (a '<>' b)@
token :: Recognizer tok -> Lexer tok
token re = Lexer { lexerTokenRE = re, lexerWhitespaceRE = mempty }
-- | Build a lexer that skips input matched by the given recognizer and
-- produces no tokens (its token recognizer is 'mempty').
--
-- 'whitespace' is a monoid homomorphism:
--
-- @'whitespace' a '<>' 'whitespace' b = 'whitespace' (a '<>' b)@
whitespace :: Recognizer a -> Lexer tok
whitespace r =
  -- The recognizer's result is discarded; only the consumed span matters.
  Lexer { lexerTokenRE = mempty, lexerWhitespaceRE = const () <$> r }
----------------------------------------------------------------------
-- Recognizer
----------------------------------------------------------------------
-- | A token recognizer
--
-- 'Recognizer' values are constructed by functions like 'longest' and
-- 'longestShortest', combined with `mappend`, and used by 'token' and
-- 'whitespace'.
--
-- When a recognizer returns without consuming any characters, a lexical
-- error is signaled.
--
-- Internally this is a staged regex: the outer 'RE' matches the prefix
-- (chosen longest) and yields the inner 'RE' used for the suffix
-- (chosen shortest).
newtype Recognizer tok = Recognizer (RE Char (RE Char tok))
  deriving Functor

-- | The combined recognizer lets both alternatives compete ('<|>').
instance Semigroup (Recognizer tok) where
  Recognizer r1 <> Recognizer r2 = Recognizer (r1 <|> r2)

-- | 'mempty' recognizes nothing.
instance Monoid (Recognizer tok) where
  mempty = Recognizer empty
  mappend = (<>)
-- | When scanning a next token, the regular expression competes with the
-- other 'Recognizer's of its 'Lexer'. If it wins, its result becomes the
-- next token.
--
-- 'longest' satisfies:
--
-- * @'longest' (r1 '<|>' r2) = 'longest' r1 '<>' 'longest' r2@
--
-- * @'longest' r = 'longestShortest' r 'pure'@
longest :: RE Char tok -> Recognizer tok
longest r = longestShortest r pure
-- | This is a more sophisticated recognizer than 'longest'.
--
-- It recognizes a token consisting of a prefix and a suffix, where the
-- prefix is chosen longest and the suffix is chosen shortest.
--
-- An example would be a C block comment
--
-- >/* comment text */
--
-- The naive
--
-- @'longest' ('string' "\/*" '*>' 'many' 'anySym' '*>' 'string' "*\/")@
--
-- doesn't work because it consumes too much: in
--
-- >/* xxx */ yyy /* zzz */
--
-- it will treat the whole line as a comment.
--
-- This is where 'longestShortest' comes in handy:
--
-- @
-- 'longestShortest'
--    ('string' "\/*")
--    (\\_ -> 'many' 'anySym' '*>' 'string' "*\/")
-- @
--
-- Operationally, the prefix regex first competes with other 'Recognizer's
-- for the longest match. If it wins, then the shortest match for the
-- suffix regex is found, and the two results are combined with the given
-- function to produce a token.
--
-- The two regular expressions combined must consume some input, or else
-- 'LexicalError' is thrown. However, any one of them may return without
-- consuming input.
--
-- \* * *
--
-- Once the prefix regex wins, the choice is committed; the suffix regex
-- must match or else a 'LexicalError' is thrown. Therefore,
--
-- @
-- 'longestShortest' pref suff1
--    '<>'
-- 'longestShortest' pref suff2
--    =
-- 'longestShortest' pref suff1
-- @
--
-- and is not the same as
--
-- @'longestShortest' pref (suff1 '<|>' suff2)@
--
-- The following holds, however:
--
-- @
-- 'longestShortest' pref1 suff
--    '<>'
-- 'longestShortest' pref2 suff
--    =
-- 'longestShortest' (pref1 '<|>' pref2) suff
-- @
longestShortest
  :: RE Char pref          -- ^ regex for the longest prefix
  -> (pref -> RE Char tok) -- ^ regex for the shortest suffix
  -> Recognizer tok
-- Stage the two regexes: matching the prefix yields the suffix regex,
-- which 'runLexer' then runs for the shortest match.
longestShortest prefRE mkSuffix = Recognizer (fmap mkSuffix prefRE)
----------------------------------------------------------------------
-- Running a Lexer
----------------------------------------------------------------------
-- | The lexical error exception
--
-- Carries the (strictly evaluated) position at which no recognizer could
-- make progress.
data LexicalError = LexicalError !Pos
  deriving (Eq, Typeable)

-- | Human-readable message; the position is rendered with 'displayPos'.
instance Show LexicalError where
  show (LexicalError pos) = "Lexical error at " ++ displayPos pos

instance Exception LexicalError
-- | A stream of tokens
--
-- Produced lazily by 'runLexer', so an error is only observed when the
-- consumer reaches the offending position in the stream.
data TokenStream tok
  = TsToken tok (TokenStream tok) -- ^ a token followed by the rest of the stream
  | TsEof                         -- ^ end of input
  | TsError LexicalError          -- ^ lexing failed here; the stream ends
  deriving (Eq, Functor, Show)
-- | Convert a 'TokenStream' to a plain token list. A 'TsError' is
-- re-raised as a runtime 'LexicalError' exception via 'throw', so list
-- consumers only hit it when they reach that point of the list.
streamToList :: TokenStream tok -> [tok]
streamToList (TsToken t rest) = t : streamToList rest
streamToList TsEof            = []
streamToList (TsError err)    = throw err
-- | Convert a 'TokenStream' into either a token list or a 'LexicalError'.
-- This function may be occasionally useful, but in general its use is
-- discouraged because it needs to force the whole stream before returning
-- a result.
streamToEitherList :: TokenStream tok -> Either LexicalError [tok]
streamToEitherList = sequence . toEithers
  where
    -- Flatten the stream into one 'Either' per token; 'sequence' then
    -- fails on the first 'Left', forcing the whole stream in the process.
    toEithers (TsToken t rest) = Right t : toEithers rest
    toEithers TsEof            = []
    toEithers (TsError err)    = [Left err]
-- | Run a lexer on a string and produce a lazy stream of tokens
runLexer
  :: forall tok.
     Lexer tok -- ^ lexer specification
  -> String    -- ^ source file name (used in locations)
  -> String    -- ^ source text
  -> TokenStream (L tok)
runLexer (Lexer (Recognizer pToken) (Recognizer pJunk)) src = go . annotate src
  where
    -- Consume the annotated character stream one token at a time.  Each
    -- element is (character, its position, previous character's position);
    -- see 'annotate'.
    go l = case l of
      [] -> TsEof
      s@((_, pos1, _):_) ->
        let
          -- last position in the stream
          -- in this branch s is non-empty, so this is safe
          last_pos :: Pos
          last_pos = case last s of (_, p, _) -> p
        in
        case findLongestPrefix re s of
          -- No recognizer matched anything at this position.
          Nothing -> TsError (LexicalError pos1)
          Just (shortest_re, rest1) ->
            -- A prefix regex won; the choice is now committed, so the
            -- suffix regex must match or it is a lexical error.
            case findShortestPrefix shortest_re rest1 of
              Nothing -> TsError . LexicalError $
                case rest1 of
                  (_, _, p):_ -> p
                  [] -> last_pos
              -- If the combined match is empty, we have a lexical error
              Just (_, (_, pos1', _):_) | pos1' == pos1 ->
                TsError $ LexicalError pos1
              -- A real token: its location spans from the first consumed
              -- character to the last consumed character.
              Just (Just tok, rest) ->
                let
                  pos2 =
                    case rest of
                      -- third field = position of the previously consumed char
                      (_, _, p):_ -> p
                      [] -> last_pos
                in TsToken (L (Loc pos1 pos2) tok) (go rest)
              -- Whitespace/comment match: emit nothing and continue.
              Just (Nothing, rest) -> go rest

    -- Lift a regex over plain characters to one over annotated characters
    -- by projecting out the character component.
    extend :: RE Char a -> RE (Char, Pos, Pos) a
    extend = comap (\(c, _, _) -> c)

    -- The combined staged regex: token matches are tagged 'Just', junk
    -- (whitespace) matches are tagged 'Nothing' so 'go' can tell them apart.
    re :: RE (Char, Pos, Pos) (RE (Char, Pos, Pos) (Maybe tok))
    re = extend . fmap extend $
      ((Just <$>) <$> pToken) <|> ((Nothing <$) <$> pJunk)
-- | Pair every character of the input with two positions: the character's
-- own position and the position of the character before it.  For the very
-- first character both positions are the start position.
annotate
  :: String             -- ^ source file name
  -> String             -- ^ contents
  -> [(Char, Pos, Pos)] -- ^ the character, its position, and the previous position
annotate src text = snd (mapAccumL step (startPos src, startPos src) text)
  where
    -- Accumulator: (position of the next character, position of the
    -- current character).  Forcing the advanced position keeps the
    -- accumulator free of thunks while the list is produced lazily.
    step (cur, prev) c =
      let next = advancePos cur c
      in next `seq` ((next, cur), (c, cur, prev))