Permalink
Find file
Fetching contributors…
Cannot retrieve contributors at this time
130 lines (114 sloc) 3.76 KB
{-# LANGUAGE BangPatterns #-}
-- | Tokenizer for ACE. Tokens retain source locations (line and column).
module ACE.Tokenizer where
import ACE.Types.Tokens
import Control.Applicative
import Control.Arrow
import Control.Monad
import Data.Attoparsec.Text hiding (number)
import Data.Char
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Read as T
-- | Tokenize some complete ACE text.
tokenize :: Text -> Either String [Token]
tokenize =
parseOnly (fmap fst tokenizer <* endOfInput)
-- | The tokenizer.
tokenizer :: Parser ([Token],(Int,Int))
tokenizer =
manyWithPos (spaces >=> token)
genitive
(1,0)
-- | Parse a token.
token :: (Int,Int) -> Parser (Token,(Int,Int))
token pos =
number pos <|>
quotedString pos <|>
period pos <|>
comma pos <|>
questionMark pos <|>
word pos
-- | Parse a number.
number :: (Int, Int) -> Parser (Token, (Int,Int))
number pos =
fmap (\n -> (Number pos (either (const 0) fst (T.decimal n)),second (+ T.length n) pos))
(takeWhile1 isDigit)
-- | Parse a quoted string, @\"foobar\"@.
quotedString :: (Int,Int) -> Parser (Token,(Int,Int))
quotedString pos =
char '"' *> (cons <$> takeWhile1 (/='"')) <* char '"'
where
cons x =
(QuotedString pos x
,second (+ (T.length x + 2)) pos)
-- | Parse a period \".\".
period :: (Int,Int) -> Parser (Token,(Int,Int))
period pos =
char '.' *> pure (Period pos,second (+1) pos)
-- | Parse a comma \",\".
comma :: (Int,Int) -> Parser (Token,(Int,Int))
comma pos =
char ',' *> pure (Comma pos,second (+1) pos)
-- | Parse a question mark \"?\".
questionMark :: (Int,Int) -> Parser (Token,(Int,Int))
questionMark pos =
char '?' *> pure (QuestionMark pos,second (+1) pos)
-- | Parse a word, which is any sequence of non-whitespace words
-- containing none of the other token characters.
word :: (Int,Int) -> Parser (Token,(Int,Int))
word pos =
cons <$> takeWhile1 wordChar
where
cons w = (Word pos w,second (+ T.length w) pos)
wordChar c =
not (isSpace c) &&
c /= '"' &&
c /= '.' &&
c /= '?' &&
c /= ',' &&
c /= '\''
-- | Parse the Saxon genitive ' or 's. This is ran after parsing every
-- token, but is expected to fail most of the time.
genitive :: (Int, Int) -> Parser (Maybe (Token,(Int, Int)))
genitive pos =
optional go
where
go =
do _ <- char '\''
ms <- peekChar
case ms of
Just 's' -> anyChar *> pure (Genitive pos True,second (+1) pos)
_ -> pure (Genitive pos False,second (+1) pos)
-- | Like 'many', but retains the current source position and supports
-- postfix-parsing of the genitive apostrophe.
manyWithPos :: (Monad m, Alternative m)
=> ((t, t1) -> m (a, (t, t1))) -> ((t, t1) -> m (Maybe (a, (t, t1))))
-> (t, t1) -> m ([a], (t, t1))
manyWithPos p p' pos =
do r <- fmap (first Just) (p pos) <|> pure (Nothing,pos)
case r of
(Nothing,_) ->
return ([],pos)
(Just x,newpos@(!_,!_)) ->
do r' <- p' newpos
case r' of
Nothing ->
do (xs,finalpos) <- manyWithPos p p' newpos
return (x:xs,finalpos)
Just (y,newpos') ->
do (xs,finalpos) <- manyWithPos p p' newpos'
return (x:y:xs,finalpos)
-- | Skip spaces (space, newline, tab (=4 spaces)) and keep
-- positioning information up to date.
spaces :: (Int,Int) -> Parser (Int,Int)
spaces (sline,scol) =
go sline scol
where
go line col =
do c <- peekChar
case c of
Just '\n' -> anyChar *> go (line+1) 0
Just ' ' -> anyChar *> go line (col+1)
Just '\t' -> anyChar *> go line (col+4)
_ -> return (line,col)