Skip to content

Commit

Permalink
Worked in unicode spaces and line separators.
Browse files Browse the repository at this point in the history
  • Loading branch information
alanz committed Dec 27, 2010
1 parent e4d7e3b commit 370c4fb
Show file tree
Hide file tree
Showing 12 changed files with 128 additions and 31 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@
/src/Language/JavaScript/Parser.hs~
/parse.txt
/TODO.txt~
/test/Unicode.js~
2 changes: 1 addition & 1 deletion language-javascript.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ Library
Language.JavaScript.Parser.StringEscape
Language.JavaScript.Parser.Token
Build-tools: happy, alex
-- ghc-options: -Wall
ghc-options: -Wall

executable runtests
if flag(buildtests)
Expand Down
21 changes: 21 additions & 0 deletions runtests.hs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import Test.Framework.Providers.HUnit
import Test.HUnit hiding (Test)


import Control.Monad (liftM)
import Language.JavaScript.Parser.Parser
import Language.JavaScript.Parser.Grammar

Expand Down Expand Up @@ -228,6 +229,16 @@ testSuite = testGroup "Parser"
, testCase "bug1" (testProg "/* */\nfunction f() {\n/* */\n}\n" "Right (JSSourceElementsTop [JSFunction (JSIdentifier \"f\") [] (JSFunctionBody [])])")
, testCase "bug1" (testProg "/* **/\nfunction f() {\n/* */\n}\n" "Right (JSSourceElementsTop [JSFunction (JSIdentifier \"f\") [] (JSFunctionBody [])])")

, testCase "unicode1-ws" (testProg "a \f\v\t\r\n=\x00a0\x1680\x180e\x2000\x2001\x2002\x2003\x2004\x2005\x2006\x2007\x2008\x2009\x200a\x2028\x2029\x202f\x205f\x3000\&1;" "Right (JSSourceElementsTop [JSExpression [JSElement \"assignmentExpression\" [JSIdentifier \"a\",JSOperator \"=\",JSDecimal \"1\"]],JSLiteral \";\"])")

, testCase "unicode2-lt" (testProg "//comment\x000Ax=1;" "Right (JSSourceElementsTop [JSExpression [JSElement \"assignmentExpression\" [JSIdentifier \"x\",JSOperator \"=\",JSDecimal \"1\"]],JSLiteral \";\"])")
, testCase "unicode3-lt" (testProg "//comment\x000Dx=1;" "Right (JSSourceElementsTop [JSExpression [JSElement \"assignmentExpression\" [JSIdentifier \"x\",JSOperator \"=\",JSDecimal \"1\"]],JSLiteral \";\"])")
, testCase "unicode4-lt" (testProg "//comment\x2028x=1;" "Right (JSSourceElementsTop [JSExpression [JSElement \"assignmentExpression\" [JSIdentifier \"x\",JSOperator \"=\",JSDecimal \"1\"]],JSLiteral \";\"])")
, testCase "unicode5-lt" (testProg "//comment\x2029x=1;" "Right (JSSourceElementsTop [JSExpression [JSElement \"assignmentExpression\" [JSIdentifier \"x\",JSOperator \"=\",JSDecimal \"1\"]],JSLiteral \";\"])")

, testCase "unicode2" (testProg "àáâãäå = 1;" "")
, testCase "unicode3" (testFile "./test/Unicode.js" "")

]

srcHelloWorld = "Hello"
Expand All @@ -248,5 +259,15 @@ testStmt str expected = expected @=? (show $ parseUsing parseStatement str "src"

-- | Assert that parsing @str@ with 'parseProgram' (under the source name
-- @"src"@) shows as exactly @expected@.
testProg str expected =
  expected @=? show (parseUsing parseProgram str "src")

-- | Run 'parseFile' on @fileName@ and assert that the shown result is exactly
-- @expected@.
testFile fileName expected =
  parseFile fileName >>= \res -> expected @=? show res


-- Set emacs mode
-- Local Variables:
-- coding: utf-8
-- End:

-- EOF
7 changes: 4 additions & 3 deletions src/Language/JavaScript/Parser.hs
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ module Language.JavaScript.Parser
, JSNode(..)
, ParseError(..)
-- Source locations
, SrcLocation (..)
, SrcSpan (..)
, Span (..)
, AlexPosn
-- , SrcLocation (..)
-- , SrcSpan (..)
-- , Span (..)
-- ParserMonad
, P
, ParseState (..)
Expand Down
60 changes: 42 additions & 18 deletions src/Language/JavaScript/Parser/Lexer.x
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ module Language.JavaScript.Parser.Lexer (
, lexCont
) where

import Control.Monad
--import Control.Monad
import Language.JavaScript.Parser.LexerUtils
import Language.JavaScript.Parser.ParserMonad
import Language.JavaScript.Parser.SrcLocation
import Language.JavaScript.Parser.Token
import qualified Data.Map as Map
import Data.Word (Word8)
--import Data.Word (Word8)

import Codec.Binary.UTF8.Light as UTF8

Expand All @@ -38,7 +38,9 @@ $ident_letter = [a-zA-Z_]

$any_char = [\x00-\xff]

$eol_char = [$lf $cr] -- any end of line character

$eol_char = [\x000A\x000D\x2028\x2029] -- any end of line character
--$eol_char = [$lf $cr] -- any end of line character
$not_eol_char = ~$eol_char -- anything but an end of line character


Expand Down Expand Up @@ -73,15 +75,37 @@ $RegExpChars = [$printable] # [ $cr $lf \\ \/]
$MultiLineNotAsteriskChar = [$any_char] # [\*]
$MultiLineNotForwardSlashOrAsteriskChar = [$any_char] # [\* \/]
-- WhiteSpace ::
-- <TAB>
-- <VT>
-- <FF>
-- <SP>
-- <NBSP>
-- <USP>
-- TODO: bring in NBSP and USP
$white_char = [\ \f\v\t\r\n]
-- See http://blog.stevenlevithan.com/archives/javascript-regex-and-unicode
-- * \u0009 — Tab — \t
-- * \u000a — Line feed — \n — (newline character)
-- * \u000b — Vertical tab — \v
-- * \u000c — Form feed — \f
-- * \u000d — Carriage return — \r — (newline character)
-- * \u0020 — Space
-- * \u00a0 — No-break space
-- * \u1680 — Ogham space mark
-- * \u180e — Mongolian vowel separator
-- * \u2000 — En quad
-- * \u2001 — Em quad
-- * \u2002 — En space
-- * \u2003 — Em space
-- * \u2004 — Three-per-em space
-- * \u2005 — Four-per-em space
-- * \u2006 — Six-per-em space
-- * \u2007 — Figure space
-- * \u2008 — Punctuation space
-- * \u2009 — Thin space
-- * \u200a — Hair space
-- * \u2028 — Line separator — (newline character)
-- * \u2029 — Paragraph separator — (newline character)
-- * \u202f — Narrow no-break space
-- * \u205f — Medium mathematical space
-- * \u3000 — Ideographic space
--$white_char = [\ \f\v\t\r\n]
$white_char = [\x0009\x000a\x000b\x000c\x000d\x0020\x00a0\x1680\x180e\x2000\x2001\x2002\x2003\x2004\x2005\x2006\x2007\x2008\x2009\x200a\x2028\x2029\x202f\x205f\x3000]
-- ! ------------------------------------------------- Terminals
tokens :-
Expand Down Expand Up @@ -221,7 +245,7 @@ tokens :-
The method is inspired by the lexer in http://jint.codeplex.com/

-}
--classifyToken :: Token -> Int
classifyToken :: Token -> Int
classifyToken token =
case token of
IdentifierToken {} -> divide
Expand Down Expand Up @@ -287,25 +311,25 @@ lexCont cont = do
-- | Encode a single character as its UTF-8 byte sequence.
--
-- 'UTF8.encodeUTF8'' works on a list of code points and yields one byte list
-- per code point, so the one-element input is matched explicitly instead of
-- going through the partial 'head'.
utf8Encode :: Char -> [Byte]
utf8Encode c =
  case UTF8.encodeUTF8' [UTF8.c2w c] of
    (bytes:_) -> bytes
    -- Presumably unreachable for a one-element input; fail with a message
    -- that names the culprit rather than "Prelude.head: empty list".
    [] -> error "utf8Encode: encodeUTF8' returned no result for a single character"

alexEOF = EOFToken alexSpanEmpty

--alexEOF = EOFToken alexSpanEmpty

-- | Drop the third component (the pending bytes) of a four-tuple, keeping the
-- other three components in their original order.
ignorePendingBytes :: (t, t1, t2, t3) -> (t, t1, t3)
ignorePendingBytes (p,c,_ps,s) = (p,c,s)


-- | The previously consumed character, i.e. the second component of the
-- input state.
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (_, prevChar, _, _) = prevChar

-- | Hand the next input byte to the scanner.
--
-- The input state is @(position, previous char, pending bytes, remaining
-- chars)@:
--
-- * while bytes of the current character are still pending, return the next
--   one and leave position and previous char untouched;
-- * with no pending bytes and no remaining chars, the input is exhausted;
-- * otherwise take the next char, advance the position with 'alexMove',
--   encode the char with 'utf8Encode', return its first byte and queue the
--   remaining bytes as pending.
alexGetByte :: AlexInput -> Maybe (Byte,AlexInput)
alexGetByte (p,c,(b:bs),s) = Just (b,(p,c,bs,s))
alexGetByte (_,_,[],[]) = Nothing
alexGetByte (p,_,[],(c:s)) =
  let p' = alexMove p c
      (b:bs) = utf8Encode c
  -- Force the new position before returning it.
  in p' `seq` Just (b, (p', c, bs, s))

-- | Advance a position past one character.
--
-- Every character moves the address on by one. A tab moves the column to the
-- next eight-character tab stop, a @'\n'@ starts a new line at column 1, and
-- any other character moves one column to the right.
--
-- NOTE(review): only @'\n'@ bumps the line counter here; the other characters
-- listed in @$eol_char@ are counted as ordinary columns — confirm that this is
-- intended.
alexMove :: AlexPosn -> Char -> AlexPosn
alexMove (AlexPn a l c) '\t' = AlexPn (a+1) l (((c+7) `div` 8)*8+1)
alexMove (AlexPn a l _) '\n' = AlexPn (a+1) (l+1) 1
alexMove (AlexPn a l c) _ = AlexPn (a+1) l (c+1)

-- ---------------------------------------------------------------------
Expand Down
4 changes: 4 additions & 0 deletions src/Language/JavaScript/Parser/LexerUtils.hs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ endOfLine lexToken span _len _str = do
-}

-- | Lexer action for tokens that carry nothing but their location: apply the
-- token constructor to the location and ignore the last two arguments.
--
-- Specialised form noted by the original author:
--   symbolToken :: (AlexSpan -> Token) -> Action
symbolToken :: (Monad m) => (t -> a) -> t -> t1 -> t2 -> m a
symbolToken mkToken location _len _str = return $ mkToken location

Expand All @@ -60,6 +62,8 @@ endOfFileToken = EOFToken alexSpanEmpty


-- | Lexer action for tokens that carry their matched text: build the token
-- from the location and the first @len@ elements of @str@.
--
-- Specialised form noted by the original author:
--   mkString :: (AlexSpan -> String -> Token) -> Action
mkString
  :: (Monad m) => (t -> [a1] -> a) -> t -> Int -> [a1] -> m a
mkString toToken loc len str =
  return (toToken loc (take len str))

Expand Down
8 changes: 4 additions & 4 deletions src/Language/JavaScript/Parser/Parser.hs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import Language.JavaScript.Parser.ParseError
import Language.JavaScript.Parser.Grammar
import Language.JavaScript.Parser.Lexer
import Language.JavaScript.Parser.ParserMonad
import Language.JavaScript.Parser.SrcLocation
--import Language.JavaScript.Parser.SrcLocation
import qualified Language.JavaScript.Parser.AST as AST

-- | Parse one compound statement, or a sequence of simple statements.
Expand All @@ -22,7 +22,7 @@ parseStmtKeepComments :: String -- ^ The input stream (Javascript source code).
-> String -- ^ The name of the Javascript source (filename or input device).
-> Either ParseError (AST.JSNode, [Token])
-- ^ An error or maybe the abstract syntax tree (AST) of zero or more Javascript statements, plus comments.
parseStmtKeepComments input _srcName =
  -- The source name argument is currently unused (hence the underscore).
  execParserKeepComments parseProgram state
  where
    state = initialState input
Expand All @@ -35,7 +35,7 @@ parse :: String -- ^ The input stream (Javascript source code).
-> String -- ^ The name of the Javascript source (filename or input device).
-> Either ParseError AST.JSNode
-- ^ An error or maybe the abstract syntax tree (AST) of zero or more Javascript statements.
parse input _srcName =
  -- The source name argument is currently unused (hence the underscore).
  execParser parseProgram state
  where
    state = initialState input
Expand All @@ -62,7 +62,7 @@ parseUsing ::
-> String -- ^ The name of the Javascript source (filename or input device).
-> Either ParseError AST.JSNode
-- ^ An error or maybe the abstract syntax tree (AST) of zero or more Javascript statements.
parseUsing p input _srcName =
  -- The source name argument is currently unused (hence the underscore).
  execParser p state
  where
    state = initialState input
Expand Down
4 changes: 2 additions & 2 deletions src/Language/JavaScript/Parser/ParserMonad.hs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ module Language.JavaScript.Parser.ParserMonad
, addComment
, getComments
, spanError
, AlexInput (..)
, AlexInput
, Byte
) where

Expand All @@ -43,7 +43,7 @@ import Control.Monad.Error as Error
import Control.Monad.State.Class
import Control.Monad.State.Strict as State
import Language.JavaScript.Parser.ParseError (ParseError (..))
import Language.JavaScript.Parser.SrcLocation (AlexPosn (..), alexStartPos, alexSpanEmpty, SrcLocation (..), SrcSpan (..), Span (..))
import Language.JavaScript.Parser.SrcLocation (AlexPosn (..), alexStartPos, alexSpanEmpty, Span (..))
import Language.JavaScript.Parser.Token (Token (..))
import Prelude hiding (span)
import Data.Word (Word8)
Expand Down
14 changes: 12 additions & 2 deletions src/Language/JavaScript/Parser/SrcLocation.hs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
module Language.JavaScript.Parser.SrcLocation (
-- * Construction
AlexPosn (..),
AlexSpan (..),
AlexSpan,
alexStartPos,
alexSpanEmpty,
SrcLocation (..),
Expand All @@ -41,14 +41,24 @@ module Language.JavaScript.Parser.SrcLocation (
import Data.Data
import Prelude hiding (span)

-- | 'AlexPosn' records the location of a token in the input text. It has three
-- fields: the address (number of characters preceding the token), the line
-- number and the column of the token within the file. 'alexStartPos' gives the
-- position of the start of the file. The lexer's @alexMove@ calculates the new
-- position after traversing a given character, assuming the usual eight
-- character tab stops.
data AlexPosn = AlexPn !Int -- address (number of characters preceding the token)
                       !Int -- line number
                       !Int -- column
        deriving (Eq,Show)

-- | Position of the start of the input: address 0, line 1, column 1.
alexStartPos :: AlexPosn
alexStartPos = AlexPn 0 1 1

-- | Span value made of a position, a character and a string.
-- AZ bringing this in as SrcSpan replacement.
type AlexSpan = (AlexPosn, Char, String)
-- | The empty span: the start position, a newline character and an empty
-- string.
alexSpanEmpty :: AlexSpan
alexSpanEmpty = (alexStartPos, '\n', "")

-- | A location for a syntactic entity from the source code.
Expand Down
2 changes: 1 addition & 1 deletion src/Language/JavaScript/Parser/Token.hs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ module Language.JavaScript.Parser.Token (
) where

--import Language.JavaScript.Parser.Pretty
import Language.JavaScript.Parser.SrcLocation (AlexSpan (..),SrcSpan (..), Span(getSpan))
import Language.JavaScript.Parser.SrcLocation (AlexSpan)
import Data.Data

-- | Lexical tokens.
Expand Down
6 changes: 6 additions & 0 deletions test/Unicode.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// -*- coding: utf-8 -*-

àáâãäå = 1;



30 changes: 30 additions & 0 deletions test/unicode.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
-*- coding: utf-8; mode: xub -*-
¢ € ₠ £ ¥ ¤
° © ® ™ § ¶ † ‡ ※
•◦ ‣ ✓ ●■◆ ○□◇ ★☆ ♠♣♥♦ ♤♧♡♢
“” ‘’ ¿¡ «» ‹› ¶§ª - ‐ ‑ ‒ – — ― …
àáâãäåæç èéêë ìíîï ðñòóôõö øùúûüýþÿ ÀÁÂÃÄÅ Ç ÈÉÊË ÌÍÎÏ ÐÑ ÒÓÔÕÖ ØÙÚÛÜÝÞß
Æ ᴁ ᴂ ᴈ
ΑΒΓΔ ΕΖΗΘ ΙΚΛΜ ΝΞΟΠ ΡΣΤΥ ΦΧΨΩ αβγδ εζηθ ικλμ νξοπ ρςτυ φχψω
⌈⌉ ⌊⌋ ∏ ∑ ∫ ×÷ ⊕ ⊖ ⊗ ⊘ ⊙ ∙ ∘ ′ ″ ‴ ∼ ∂ √ ≔ × ⁱ ⁰ ¹ ² ³ ₀ ₁ ₂
π ∞ ± ∎
∀¬∧∨∃⊦∵∴∅∈∉⊂⊃⊆⊇⊄⋂⋃
≠≤≥≮≯≫≪≈≡
ℕℤℚℝℂ
←→↑↓ ↔ ↖↗↙↘ ⇐⇒⇑⇓ ⇔⇗ ⇦⇨⇧⇩ ↞↠↟↡ ↺↻ ☞☜☝☟
λ ƒ Ɱ
⌘ ⌥ ‸ ⇧ ⌤ ↑ ↓ → ← ⇞ ⇟ ↖ ↘ ⌫ ⌦ ⎋⏏ ↶↷ ◀▶▲▼ ◁▷△▽ ⇄ ⇤⇥ ↹ ↵↩⏎ ⌧ ⌨ ␣ ⌶ ⎗⎘⎙⎚ ⌚⌛ ✂✄ ✉✍

♩♪♫♬♭♮♯
➀➁➂➃➄➅➆➇➈➉
卐卍✝✚✡☥⎈☭☪☮☺☹ ☯☰☱☲☳☴☵☶☷ ☠☢☣☤♲♳⌬♨♿ ☉☼☾☽ ♀♂ ♔♕♖ ♗♘♙ ♚♛ ♜♝♞♟
 、。!,:「」『』〈〉《》〖〗【】〔〕

ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩ

林花謝了春紅 太匆匆, 無奈朝來寒雨 晚來風
胭脂淚 留人醉 幾時重, 自是人生長恨 水長東

http://xahlee.org/emacs/unicode-browser.html
http://xahlee.org/Periodic_dosage_dir/t1/20040505_unicode.html

0 comments on commit 370c4fb

Please sign in to comment.