-
Notifications
You must be signed in to change notification settings - Fork 12
/
Interface.hs
314 lines (289 loc) · 10.9 KB
/
Interface.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
{-# LANGUAGE TypeFamilies, GADTs, TupleSections #-}
module Text.Regex.Applicative.Interface where
import Control.Applicative hiding (empty)
import Control.Arrow
import Control.Monad (guard)
import qualified Data.List as List
import Data.Maybe
import Text.Regex.Applicative.Types
import Text.Regex.Applicative.Object
-- | 'RE' is a profunctor. This is its contravariant map.
--
-- (A dependency on the @profunctors@ package doesn't seem justified.)
comap :: (s2 -> s1) -> RE s1 a -> RE s2 a
comap f re =
case re of
Eps -> Eps
Symbol t p -> Symbol t (p . f)
Alt r1 r2 -> Alt (comap f r1) (comap f r2)
App r1 r2 -> App (comap f r1) (comap f r2)
Fmap g r -> Fmap g (comap f r)
CatMaybes r -> CatMaybes (comap f r)
Fail -> Fail
Rep gr fn a r -> Rep gr fn a (comap f r)
Void r -> Void (comap f r)
-- | Match and return any single symbol
anySym :: RE s s
anySym = msym Just
-- | Match zero or more instances of the given expression, which are combined using
-- the given folding function.
--
-- 'Greediness' argument controls whether this regular expression should match
-- as many as possible ('Greedy') or as few as possible ('NonGreedy') instances
-- of the underlying expression.
reFoldl :: Greediness -> (b -> a -> b) -> b -> RE s a -> RE s b
reFoldl g f b a = Rep g f b a
-- | Match zero or more instances of the given expression, but as
-- few of them as possible (i.e. /non-greedily/). A greedy equivalent of 'few'
-- is 'many'.
--
-- Examples:
--
-- >>> findFirstPrefix (few anySym <* "b") "ababab"
-- Just ("a","abab")
-- >>> findFirstPrefix (many anySym <* "b") "ababab"
-- Just ("ababa","")
few :: RE s a -> RE s [a]
few a = reverse <$> Rep NonGreedy (flip (:)) [] a
-- | Return matched symbols as part of the return value
withMatched :: RE s a -> RE s (a, [s])
withMatched Eps = flip (,) [] <$> Eps
withMatched (Symbol t p) = Symbol t (\s -> (,[s]) <$> p s)
withMatched (Alt a b) = withMatched a <|> withMatched b
withMatched (App a b) =
(\(f, s) (x, t) -> (f x, s ++ t)) <$>
withMatched a <*>
withMatched b
withMatched Fail = Fail
withMatched (Fmap f x) = (f *** id) <$> withMatched x
withMatched (CatMaybes x) = CatMaybes $
(\ (as, s) -> flip (,) s <$> as) <$> withMatched x
withMatched (Rep gr f a0 x) =
Rep gr (\(a, s) (x, t) -> (f a x, s ++ t)) (a0, []) (withMatched x)
-- N.B.: this ruins the Void optimization
withMatched (Void x) = (const () *** id) <$> withMatched x
-- | @s =~ a = match a s@
(=~) :: [s] -> RE s a -> Maybe a
(=~) = flip match
infix 2 =~
-- | Attempt to match a string of symbols against the regular expression.
-- Note that the whole string (not just some part of it) should be matched.
--
-- Examples:
--
-- >>> match (sym 'a' <|> sym 'b') "a"
-- Just 'a'
-- >>> match (sym 'a' <|> sym 'b') "ab"
-- Nothing
--
match :: RE s a -> [s] -> Maybe a
match re = let obj = compile re in \str ->
listToMaybe $
results $
foldl (flip step) obj str
-- | Find a string prefix which is matched by the regular expression.
--
-- Of all matching prefixes, pick one using left bias (prefer the left part of
-- '<|>' to the right part) and greediness.
--
-- This is the match which a backtracking engine (such as Perl's one) would find
-- first.
--
-- If match is found, the rest of the input is also returned.
--
-- See also 'findFirstPrefixWithUncons', of which this is a special case.
--
-- Examples:
--
-- >>> findFirstPrefix ("a" <|> "ab") "abc"
-- Just ("a","bc")
-- >>> findFirstPrefix ("ab" <|> "a") "abc"
-- Just ("ab","c")
-- >>> findFirstPrefix "bc" "abc"
-- Nothing
findFirstPrefix :: RE s a -> [s] -> Maybe (a, [s])
findFirstPrefix = findFirstPrefixWithUncons List.uncons
-- | Find the first prefix, with the given @uncons@ function.
--
-- @since 0.3.4
findFirstPrefixWithUncons :: (ss -> Maybe (s, ss)) -> RE s a -> ss -> Maybe (a, ss)
findFirstPrefixWithUncons = findPrefixWith' (walk emptyObject . threads)
where
walk obj [] = (obj, Nothing)
walk obj (t:ts) =
case getResult t of
Just r -> (obj, Just r)
Nothing -> walk (addThread t obj) ts
-- | Find the longest string prefix which is matched by the regular expression.
--
-- Submatches are still determined using left bias and greediness, so this is
-- different from POSIX semantics.
--
-- If match is found, the rest of the input is also returned.
--
-- See also 'findLongestPrefixWithUncons', of which this is a special case.
--
-- Examples:
--
-- >>> import Data.Char
-- >>> let keyword = "if"
-- >>> let identifier = many $ psym isAlpha
-- >>> let lexeme = (Left <$> keyword) <|> (Right <$> identifier)
-- >>> findLongestPrefix lexeme "if foo"
-- Just (Left "if"," foo")
-- >>> findLongestPrefix lexeme "iffoo"
-- Just (Right "iffoo","")
findLongestPrefix :: RE s a -> [s] -> Maybe (a, [s])
findLongestPrefix = findLongestPrefixWithUncons List.uncons
-- | Find the longest prefix, with the given @uncons@ function.
--
-- @since 0.3.4
findLongestPrefixWithUncons :: (ss -> Maybe (s, ss)) -> RE s a -> ss -> Maybe (a, ss)
findLongestPrefixWithUncons = findPrefixWith' ((,) <*> listToMaybe . results)
findPrefixWith'
:: (ReObject s a -> (ReObject s a, Maybe a))
-- ^ Given the regex object, compute the regex object to feed the next input value into, and
-- the result, if any.
-> (ss -> Maybe (s, ss)) -- ^ @uncons@
-> RE s a -> ss -> Maybe (a, ss)
findPrefixWith' walk uncons = \ re -> go (compile re) Nothing
where
go obj resOld ss = case walk obj of
(obj', resThis) ->
let res = flip (,) ss <$> resThis <|> resOld
in
case uncons ss of
_ | failed obj' -> res
Nothing -> res
Just (s, ss) -> go (step s obj') res ss
-- | Find the shortest prefix (analogous to 'findLongestPrefix')
--
-- See also 'findShortestPrefixWithUncons', of which this is a special case.
findShortestPrefix :: RE s a -> [s] -> Maybe (a, [s])
findShortestPrefix = findShortestPrefixWithUncons List.uncons
-- | Find the shortest prefix (analogous to 'findLongestPrefix'), with the given @uncons@ function.
--
-- @since 0.3.4
findShortestPrefixWithUncons :: (ss -> Maybe (s, ss)) -> RE s a -> ss -> Maybe (a, ss)
findShortestPrefixWithUncons uncons = go . compile
where
go obj ss = case results obj of
r:_ -> Just (r, ss)
_ -> do
guard (not (failed obj))
(s, ss) <- uncons ss
go (step s obj) ss
-- | Find the leftmost substring that is matched by the regular expression.
-- Otherwise behaves like 'findFirstPrefix'. Returns the result together with
-- the prefix and suffix of the string surrounding the match.
findFirstInfix :: RE s a -> [s] -> Maybe ([s], a, [s])
findFirstInfix re str =
fmap (\((first, res), last) -> (first, res, last)) $
findFirstPrefix ((,) <$> few anySym <*> re) str
-- Auxiliary function for findExtremeInfix
prefixCounter :: RE s (Int, [s])
prefixCounter = second reverse <$> reFoldl NonGreedy f (0, []) anySym
where
f (i, prefix) s = ((,) $! (i+1)) $ s:prefix
data InfixMatchingState s a = GotResult
{ prefixLen :: !Int
, prefixStr :: [s]
, result :: a
, postfixStr :: [s]
}
| NoResult
-- a `preferOver` b chooses one of a and b, giving preference to a
preferOver
:: InfixMatchingState s a
-> InfixMatchingState s a
-> InfixMatchingState s a
preferOver NoResult b = b
preferOver b NoResult = b
preferOver a b =
case prefixLen a `compare` prefixLen b of
GT -> b -- prefer b when it has smaller prefix
_ -> a -- otherwise, prefer a
mkInfixMatchingState
:: [s] -- rest of input
-> Thread s ((Int, [s]), a)
-> InfixMatchingState s a
mkInfixMatchingState rest thread =
case getResult thread of
Just ((pLen, pStr), res) ->
GotResult
{ prefixLen = pLen
, prefixStr = pStr
, result = res
, postfixStr = rest
}
Nothing -> NoResult
gotResult :: InfixMatchingState s a -> Bool
gotResult GotResult {} = True
gotResult _ = False
-- Algorithm for finding leftmost longest infix match:
--
-- 1. Add a thread /.*?/ to the begginning of the regexp
-- 2. As soon as we get first accept, we delete that thread
-- 3. When we get more than one accept, we choose one by the following criteria:
-- 3.1. Compare by the length of prefix (since we are looking for the leftmost
-- match)
-- 3.2. If they are produced on the same step, choose the first one (left-biased
-- choice)
-- 3.3. If they are produced on the different steps, choose the later one (since
-- they have the same prefixes, later means longer)
findExtremalInfix
:: -- function to combine a later result (first arg) to an earlier one (second
-- arg)
(InfixMatchingState s a -> InfixMatchingState s a -> InfixMatchingState s a)
-> RE s a
-> [s]
-> Maybe ([s], a, [s])
findExtremalInfix newOrOld re str =
case go (compile $ (,) <$> prefixCounter <*> re) str NoResult of
NoResult -> Nothing
r@GotResult{} ->
Just (prefixStr r, result r, postfixStr r)
where
{-
go :: ReObject s ((Int, [s]), a)
-> [s]
-> InfixMatchingState s a
-> InfixMatchingState s a
-}
go obj str resOld =
let resThis =
foldl
(\acc t -> acc `preferOver` mkInfixMatchingState str t)
NoResult $
threads obj
res = resThis `newOrOld` resOld
obj' =
-- If we just found the first result, kill the "prefixCounter" thread.
-- We rely on the fact that it is the last thread of the object.
if gotResult resThis && not (gotResult resOld)
then fromThreads $ init $ threads obj
else obj
in
case str of
[] -> res
_ | failed obj -> res
(s:ss) -> go (step s obj') ss res
-- | Find the leftmost substring that is matched by the regular expression.
-- Otherwise behaves like 'findLongestPrefix'. Returns the result together with
-- the prefix and suffix of the string surrounding the match.
findLongestInfix :: RE s a -> [s] -> Maybe ([s], a, [s])
findLongestInfix = findExtremalInfix preferOver
-- | Find the leftmost substring that is matched by the regular expression.
-- Otherwise behaves like 'findShortestPrefix'. Returns the result together with
-- the prefix and suffix of the string surrounding the match.
findShortestInfix :: RE s a -> [s] -> Maybe ([s], a, [s])
findShortestInfix = findExtremalInfix $ flip preferOver
-- | Replace matches of the regular expression with its value.
--
-- >>> replace ("!" <$ sym 'f' <* some (sym 'o')) "quuxfoofooooofoobarfobar"
-- "quux!!!bar!bar"
replace :: RE s [s] -> [s] -> [s]
replace r = ($ []) . go
where go ys = case findLongestInfix r ys of
Nothing -> (ys ++)
Just (before, m, rest) -> (before ++) . (m ++) . go rest