Worked in unicode spaces and line separators.

erikd · Dec 27, 2010 · 370c4fb · 370c4fb
1 parent e4d7e3b
commit 370c4fb
Show file tree

Hide file tree

Showing 12 changed files with 128 additions and 31 deletions.
diff --git a/.gitignore b/.gitignore
@@ -19,3 +19,4 @@
 /src/Language/JavaScript/Parser.hs~
 /parse.txt
 /TODO.txt~
+/test/Unicode.js~
diff --git a/language-javascript.cabal b/language-javascript.cabal
@@ -43,7 +43,7 @@ Library
                        Language.JavaScript.Parser.StringEscape
                        Language.JavaScript.Parser.Token
   Build-tools:         happy, alex
-  -- ghc-options:         -Wall
+  ghc-options:         -Wall
 
 executable             runtests
     if flag(buildtests)

diff --git a/runtests.hs b/runtests.hs
@@ -4,6 +4,7 @@ import Test.Framework.Providers.HUnit
 import Test.HUnit hiding (Test)
 
 
+import Control.Monad (liftM)
 import Language.JavaScript.Parser.Parser
 import Language.JavaScript.Parser.Grammar
 
@@ -228,6 +229,16 @@ testSuite = testGroup "Parser"
    , testCase "bug1" (testProg "/* */\nfunction f() {\n/*  */\n}\n" "Right (JSSourceElementsTop [JSFunction (JSIdentifier \"f\") [] (JSFunctionBody [])])")
    , testCase "bug1" (testProg "/* **/\nfunction f() {\n/*  */\n}\n" "Right (JSSourceElementsTop [JSFunction (JSIdentifier \"f\") [] (JSFunctionBody [])])")
 
+   , testCase "unicode1-ws" (testProg "a \f\v\t\r\n=\x00a0\x1680\x180e\x2000\x2001\x2002\x2003\x2004\x2005\x2006\x2007\x2008\x2009\x200a\x2028\x2029\x202f\x205f\x3000\&1;" "Right (JSSourceElementsTop [JSExpression [JSElement \"assignmentExpression\" [JSIdentifier \"a\",JSOperator \"=\",JSDecimal \"1\"]],JSLiteral \";\"])")
+
+   , testCase "unicode2-lt" (testProg "//comment\x000Ax=1;" "Right (JSSourceElementsTop [JSExpression [JSElement \"assignmentExpression\" [JSIdentifier \"x\",JSOperator \"=\",JSDecimal \"1\"]],JSLiteral \";\"])")  
+   , testCase "unicode3-lt" (testProg "//comment\x000Dx=1;" "Right (JSSourceElementsTop [JSExpression [JSElement \"assignmentExpression\" [JSIdentifier \"x\",JSOperator \"=\",JSDecimal \"1\"]],JSLiteral \";\"])")  
+   , testCase "unicode4-lt" (testProg "//comment\x2028x=1;" "Right (JSSourceElementsTop [JSExpression [JSElement \"assignmentExpression\" [JSIdentifier \"x\",JSOperator \"=\",JSDecimal \"1\"]],JSLiteral \";\"])")  
+   , testCase "unicode5-lt" (testProg "//comment\x2029x=1;" "Right (JSSourceElementsTop [JSExpression [JSElement \"assignmentExpression\" [JSIdentifier \"x\",JSOperator \"=\",JSDecimal \"1\"]],JSLiteral \";\"])")  
+
+   , testCase "unicode2" (testProg "àáâãäå = 1;" "")
+   , testCase "unicode3" (testFile "./test/Unicode.js" "")  
+
     ]
 
 srcHelloWorld = "Hello"
@@ -248,5 +259,15 @@ testStmt str expected = expected @=? (show $ parseUsing parseStatement str "src"
 
 testProg str expected = expected @=? (show $ parseUsing parseProgram str "src")
 
+-- | Parse the JavaScript file at @fileName@ and assert that the shown parse
+-- result equals @expected@.
+testFile fileName expected =
+  parseFile fileName >>= (expected @=?) . show
+
+
+-- Set emacs mode
+-- Local Variables: 
+-- coding: utf-8
+-- End:             
 
 -- EOF
diff --git a/src/Language/JavaScript/Parser.hs b/src/Language/JavaScript/Parser.hs
@@ -6,9 +6,10 @@ module Language.JavaScript.Parser
        , JSNode(..)  
        , ParseError(..)  
        -- Source locations  
-       , SrcLocation (..)
-       , SrcSpan (..)
-       , Span (..)
+       , AlexPosn
+       -- , SrcLocation (..)
+       -- , SrcSpan (..)
+       -- , Span (..)
        -- ParserMonad  
        , P  
        , ParseState (..)  

diff --git a/src/Language/JavaScript/Parser/Lexer.x b/src/Language/JavaScript/Parser/Lexer.x
@@ -5,13 +5,13 @@ module Language.JavaScript.Parser.Lexer (
     , lexCont 
     ) where
 
-import Control.Monad
+--import Control.Monad
 import Language.JavaScript.Parser.LexerUtils
 import Language.JavaScript.Parser.ParserMonad 
 import Language.JavaScript.Parser.SrcLocation
 import Language.JavaScript.Parser.Token
 import qualified Data.Map as Map
-import Data.Word (Word8)
+--import Data.Word (Word8)
 
 import Codec.Binary.UTF8.Light as UTF8
 
@@ -38,7 +38,9 @@ $ident_letter = [a-zA-Z_]
 
 $any_char = [\x00-\xff]
 
-$eol_char = [$lf $cr] -- any end of line character
+
+$eol_char = [\x000A\x000D\x2028\x2029] -- any end of line character
+--$eol_char = [$lf $cr] -- any end of line character
 $not_eol_char = ~$eol_char -- anything but an end of line character
 
 
@@ -73,15 +75,37 @@ $RegExpChars = [$printable] # [ $cr $lf \\ \/]
 $MultiLineNotAsteriskChar               = [$any_char] # [\*]
 $MultiLineNotForwardSlashOrAsteriskChar = [$any_char] # [\* \/]
 
--- WhiteSpace ::
---      <TAB>
---      <VT>
---      <FF>
---      <SP>
---      <NBSP>
---      <USP>
--- TODO: bring in NBSP and USP
-$white_char   = [\ \f\v\t\r\n]
+
+-- Characters matched by $white_char; list from http://blog.stevenlevithan.com/archives/javascript-regex-and-unicode
+    -- * \u0009 — Tab — \t
+    -- * \u000a — Line feed — \n — (newline character)
+    -- * \u000b — Vertical tab — \v
+    -- * \u000c — Form feed — \f
+    -- * \u000d — Carriage return — \r — (newline character)
+    -- * \u0020 — Space
+    -- * \u00a0 — No-break space
+    -- * \u1680 — Ogham space mark
+    -- * \u180e — Mongolian vowel separator
+    -- * \u2000 — En quad
+    -- * \u2001 — Em quad
+    -- * \u2002 — En space
+    -- * \u2003 — Em space
+    -- * \u2004 — Three-per-em space
+    -- * \u2005 — Four-per-em space
+    -- * \u2006 — Six-per-em space
+    -- * \u2007 — Figure space
+    -- * \u2008 — Punctuation space
+    -- * \u2009 — Thin space
+    -- * \u200a — Hair space
+    -- * \u2028 — Line separator — (newline character)
+    -- * \u2029 — Paragraph separator — (newline character)
+    -- * \u202f — Narrow no-break space
+    -- * \u205f — Medium mathematical space
+    -- * \u3000 — Ideographic space
+
+--$white_char   = [\ \f\v\t\r\n]
+$white_char = [\x0009\x000a\x000b\x000c\x000d\x0020\x00a0\x1680\x180e\x2000\x2001\x2002\x2003\x2004\x2005\x2006\x2007\x2008\x2009\x200a\x2028\x2029\x202f\x205f\x3000]
+
 
 -- ! ------------------------------------------------- Terminals
 tokens :-
@@ -221,7 +245,7 @@ tokens :-
 The method is inspired by the lexer in http://jint.codeplex.com/
 
 -}
---classifyToken :: Token -> Int
+classifyToken :: Token -> Int
 classifyToken token = 
    case token of
       IdentifierToken {} -> divide
@@ -287,25 +311,25 @@ lexCont cont = do
 utf8Encode :: Char -> [Byte]
 utf8Encode c = head (UTF8.encodeUTF8' [UTF8.c2w c])
 
-alexEOF = EOFToken alexSpanEmpty
-
+--alexEOF = EOFToken alexSpanEmpty
 
-ignorePendingBytes (p,c,ps,s) = (p,c,s)
+ignorePendingBytes :: forall t t1 t2 t3. (t, t1, t2, t3) -> (t, t1, t3)
+ignorePendingBytes (p,c,_ps,s) = (p,c,s)
 
 
 alexInputPrevChar :: AlexInput -> Char
 alexInputPrevChar (p,c,bs,s) = c
 
 alexGetByte :: AlexInput -> Maybe (Byte,AlexInput)
 alexGetByte (p,c,(b:bs),s) = Just (b,(p,c,bs,s))
-alexGetByte (p,c,[],[]) = Nothing
+alexGetByte (_p,_c,[],[]) = Nothing
 alexGetByte (p,_,[],(c:s))  = let p' = alexMove p c 
                                   (b:bs) = utf8Encode c
                               in p' `seq`  Just (b, (p', c, bs, s))
 
 alexMove :: AlexPosn -> Char -> AlexPosn
 alexMove (AlexPn a l c) '\t' = AlexPn (a+1)  l     (((c+7) `div` 8)*8+1)
-alexMove (AlexPn a l c) '\n' = AlexPn (a+1) (l+1)   1
+alexMove (AlexPn a l _c) '\n' = AlexPn (a+1) (l+1)   1
 alexMove (AlexPn a l c) _    = AlexPn (a+1)  l     (c+1)
 
 -- ---------------------------------------------------------------------

diff --git a/src/Language/JavaScript/Parser/LexerUtils.hs b/src/Language/JavaScript/Parser/LexerUtils.hs
@@ -50,6 +50,8 @@ endOfLine lexToken span _len _str = do
 -}
 
 --symbolToken :: (AlexSpan -> Token) -> Action 
+symbolToken :: (Monad m) => (t -> a) -> t -> t1 -> t2 -> m a
+--symbolToken :: (Monad m) => (AlexSpan -> Token) -> t -> t1 -> t2 -> m Token
 symbolToken mkToken location _ _ = return (mkToken location)
 --symbolToken mkToken location = return (mkToken location)
 
@@ -60,6 +62,8 @@ endOfFileToken = EOFToken alexSpanEmpty
 
 
 --mkString :: (AlexSpan -> String -> Token) -> Action
+mkString
+  :: (Monad m) => (t -> [a1] -> a) -> t -> Int -> [a1] -> m a
 mkString toToken loc len str = do
    return $ toToken loc (take len str)
 

diff --git a/src/Language/JavaScript/Parser/Parser.hs b/src/Language/JavaScript/Parser/Parser.hs
@@ -12,7 +12,7 @@ import Language.JavaScript.Parser.ParseError
 import Language.JavaScript.Parser.Grammar
 import Language.JavaScript.Parser.Lexer
 import Language.JavaScript.Parser.ParserMonad
-import Language.JavaScript.Parser.SrcLocation
+--import Language.JavaScript.Parser.SrcLocation
 import qualified Language.JavaScript.Parser.AST as AST
 
 -- | Parse one compound statement, or a sequence of simple statements. 
@@ -22,7 +22,7 @@ parseStmtKeepComments :: String -- ^ The input stream (Javascript source code).
       -> String -- ^ The name of the Javascript source (filename or input device). 
       -> Either ParseError (AST.JSNode, [Token]) 
          -- ^ An error or maybe the abstract syntax tree (AST) of zero or more Javascript statements, plus comments.
-parseStmtKeepComments input srcName = 
+parseStmtKeepComments input _srcName = 
    execParserKeepComments parseProgram state 
    where
      state = initialState input 
@@ -35,7 +35,7 @@ parse :: String -- ^ The input stream (Javascript source code).
       -> String -- ^ The name of the Javascript source (filename or input device). 
       -> Either ParseError AST.JSNode 
          -- ^ An error or maybe the abstract syntax tree (AST) of zero or more Javascript statements, plus comments.
-parse input srcName = 
+parse input _srcName = 
    execParser parseProgram state 
    where
      state = initialState input 
@@ -62,7 +62,7 @@ parseUsing ::
       -> String -- ^ The name of the Javascript source (filename or input device). 
       -> Either ParseError AST.JSNode 
          -- ^ An error or maybe the abstract syntax tree (AST) of zero or more Javascript statements, plus comments.
-parseUsing p input srcName = 
+parseUsing p input _srcName = 
    execParser p state 
    where
      state = initialState input

diff --git a/src/Language/JavaScript/Parser/ParserMonad.hs b/src/Language/JavaScript/Parser/ParserMonad.hs
@@ -34,7 +34,7 @@ module Language.JavaScript.Parser.ParserMonad
    , addComment
    , getComments
    , spanError
-   , AlexInput (..)
+   , AlexInput 
    , Byte  
    ) where
 
@@ -43,7 +43,7 @@ import Control.Monad.Error as Error
 import Control.Monad.State.Class
 import Control.Monad.State.Strict as State
 import Language.JavaScript.Parser.ParseError (ParseError (..))
-import Language.JavaScript.Parser.SrcLocation (AlexPosn (..), alexStartPos, alexSpanEmpty, SrcLocation (..), SrcSpan (..), Span (..))
+import Language.JavaScript.Parser.SrcLocation (AlexPosn (..), alexStartPos, alexSpanEmpty, Span (..))
 import Language.JavaScript.Parser.Token (Token (..))
 import Prelude hiding (span)
 import Data.Word (Word8)

diff --git a/src/Language/JavaScript/Parser/SrcLocation.hs b/src/Language/JavaScript/Parser/SrcLocation.hs
@@ -15,7 +15,7 @@
 module Language.JavaScript.Parser.SrcLocation (
   -- * Construction 
   AlexPosn (..),
-  AlexSpan (..),
+  AlexSpan,
   alexStartPos,
   alexSpanEmpty,
   SrcLocation (..),
@@ -41,14 +41,24 @@ module Language.JavaScript.Parser.SrcLocation (
 import Data.Data
 import Prelude hiding (span)
 
-data AlexPosn = AlexPn !Int !Int !Int
+-- | 'AlexPosn' records the location of a token in the input text.  It has three
+-- fields: the address (number of characters preceding the token), line number
+-- and column of a token within the file. 'alexStartPos' gives the position of
+-- the start of the file. (Wording inherited from Alex, which names these @Posn@,
+-- @start_pos@, @eof_pos@ and @move_pos@.) The lexer's @alexMove@ calculates the new
+-- position after traversing a given character, assuming the usual eight character tab stops.
+
+data AlexPosn = AlexPn !Int -- address (number of characters preceding the token)
+                       !Int -- line number
+                       !Int -- column
         deriving (Eq,Show)
 
 alexStartPos :: AlexPosn
 alexStartPos = AlexPn 0 1 1
 
 -- AZ bringing this in as SrcSpan replacement.
 type AlexSpan = (AlexPosn, Char, String)
+alexSpanEmpty :: AlexSpan
 alexSpanEmpty = (alexStartPos, '\n', "")
 
 -- | A location for a syntactic entity from the source code.

diff --git a/src/Language/JavaScript/Parser/Token.hs b/src/Language/JavaScript/Parser/Token.hs
@@ -23,7 +23,7 @@ module Language.JavaScript.Parser.Token (
    ) where
 
 --import Language.JavaScript.Parser.Pretty
-import Language.JavaScript.Parser.SrcLocation (AlexSpan (..),SrcSpan (..), Span(getSpan))
+import Language.JavaScript.Parser.SrcLocation (AlexSpan)
 import Data.Data
 
 -- | Lexical tokens.

diff --git a/test/Unicode.js b/test/Unicode.js
@@ -0,0 +1,6 @@
+// -*- coding: utf-8 -*-
+
+àáâãäå = 1;
+
+
+
diff --git a/test/unicode.txt b/test/unicode.txt
@@ -0,0 +1,30 @@
+-*- coding: utf-8; mode: xub -*-
+¢ € ₠ £ ¥ ¤
+ ° © ® ™ § ¶ † ‡ ※
+ •◦ ‣ ✓ ●■◆ ○□◇ ★☆ ♠♣♥♦ ♤♧♡♢
+ “” ‘’ ¿¡  «» ‹›  ¶§ª - ‐ ‑ ‒ – — ― …
+àáâãäåæç èéêë ìíîï ðñòóôõö øùúûüýþÿ ÀÁÂÃÄÅ Ç ÈÉÊË ÌÍÎÏ ÐÑ ÒÓÔÕÖ ØÙÚÛÜÝÞß 
+Æ  ᴁ ᴂ ᴈ
+ ΑΒΓΔ ΕΖΗΘ ΙΚΛΜ ΝΞΟΠ ΡΣΤΥ ΦΧΨΩ αβγδ εζηθ ικλμ νξοπ ρςτυ φχψω
+ ⌈⌉ ⌊⌋ ∏ ∑ ∫ ×÷ ⊕ ⊖ ⊗ ⊘ ⊙ ∙ ∘ ′ ″ ‴ ∼ ∂ √ ≔ × ⁱ ⁰ ¹ ² ³ ₀ ₁ ₂
+ π ∞ ± ∎
+ ∀¬∧∨∃⊦∵∴∅∈∉⊂⊃⊆⊇⊄⋂⋃
+ ≠≤≥≮≯≫≪≈≡
+ ℕℤℚℝℂ
+ ←→↑↓ ↔ ↖↗↙↘  ⇐⇒⇑⇓ ⇔⇗  ⇦⇨⇧⇩ ↞↠↟↡ ↺↻ ☞☜☝☟
+λ ƒ Ɱ
+ ⌘ ⌥ ‸ ⇧ ⌤ ↑ ↓ → ← ⇞ ⇟ ↖ ↘ ⌫ ⌦ ⎋⏏ ↶↷ ◀▶▲▼ ◁▷△▽ ⇄ ⇤⇥ ↹ ↵↩⏎ ⌧ ⌨ ␣ ⌶ ⎗⎘⎙⎚ ⌚⌛ ✂✄ ✉✍
+
+ ♩♪♫♬♭♮♯
+ ➀➁➂➃➄➅➆➇➈➉
+ 卐卍✝✚✡☥⎈☭☪☮☺☹ ☯☰☱☲☳☴☵☶☷ ☠☢☣☤♲♳⌬♨♿ ☉☼☾☽ ♀♂ ♔♕♖ ♗♘♙ ♚♛ ♜♝♞♟
+ ❦
+　、。！，：「」『』〈〉《》〖〗【】〔〕
+
+ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩ
+
+林花謝了春紅 太匆匆, 無奈朝來寒雨 晚來風
+胭脂淚 留人醉 幾時重, 自是人生長恨 水長東
+
+ http://xahlee.org/emacs/unicode-browser.html
+ http://xahlee.org/Periodic_dosage_dir/t1/20040505_unicode.html