Skip to content

Commit

Permalink
Working on full unicode set for identifiers. It is huge - 14980 different characters
Browse files Browse the repository at this point in the history
  • Loading branch information
alanz committed Dec 27, 2010
1 parent 0d73c12 commit c65678d
Show file tree
Hide file tree
Showing 4 changed files with 15,011 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Expand Up @@ -20,3 +20,10 @@
/parse.txt
/TODO.txt~
/test/Unicode.js~
# Backup copy of the download script (trailing "~"; presumably written by an editor)
/unicode/doit.sh~
# HTML pages saved by unicode/doit.sh, one per Unicode category (Ll, Lm, Lo, Lt, Lu, Nl)
/unicode/uc-ll.htm
/unicode/uc-lm.htm
/unicode/uc-lo.htm
/unicode/uc-lt.htm
/unicode/uc-lu.htm
/unicode/uc-nl.htm
9 changes: 9 additions & 0 deletions src/Language/JavaScript/Parser/Lexer.x
Expand Up @@ -105,6 +105,15 @@ $MultiLineNotForwardSlashOrAsteriskChar = [$any_char] # [\* \/]
-- Earlier escape-based definition, kept for reference:
--$white_char = [\ \f\v\t\r\n]
-- White space listed as explicit code points: U+0009 (TAB), U+000A (LF), U+000B (VT),
-- U+000C (FF), U+000D (CR), U+0020 (SPACE), U+00A0 (NO-BREAK SPACE), followed by
-- U+1680, U+180E, U+2000 through U+200A, U+2028 (LINE SEPARATOR),
-- U+2029 (PARAGRAPH SEPARATOR), U+202F, U+205F and U+3000.
$white_char = [\x0009\x000a\x000b\x000c\x000d\x0020\x00a0\x1680\x180e\x2000\x2001\x2002\x2003\x2004\x2005\x2006\x2007\x2008\x2009\x200a\x2028\x2029\x202f\x205f\x3000]
-- Identifier characters
-- UnicodeLetter
-- any character in the Unicode categories “Uppercase letter (Lu)”, “Lowercase letter (Ll)”,
-- “Titlecase letter (Lt)”, “Modifier letter (Lm)”, “Other letter (Lo)”, or “Letter number (Nl)”.
-- http://www.fileformat.info/info/unicode/category/Lu/list.htm
--
-- $UnicodeLu: code points of category Lu (uppercase letter).
-- NOTE: work in progress - only code points up to U+0158 are listed here so far.
-- The Latin-1 block is split into \x00c0-\x00d6 and \x00d8-\x00de because
-- U+00D7 (MULTIPLICATION SIGN) is category Sm, not a letter.
-- U+0126 and U+0156 are uppercase letters and belong to the alternating
-- upper/lower sequence of the Latin Extended-A block.
$UnicodeLu = [\x0041-\x005a\x00c0-\x00d6\x00d8-\x00de\x0100\x0102\x0104\x0106\x0108\x010a\x010c\x010e\x0110\x0112\x0114\x0116\x0118\x011a\x011c\x011e\x0120\x0122\x0124\x0126\x0128\x012a\x012c\x012e\x0130\x0132\x0134\x0136\x0139\x013b\x013d\x013f\x0141\x0143\x0145\x0147\x014a\x014c\x014e\x0150\x0152\x0154\x0156\x0158]
-- ! ------------------------------------------------- Terminals
tokens :-
Expand Down
15 changes: 15 additions & 0 deletions unicode/doit.sh
@@ -0,0 +1,15 @@
#!/bin/sh
#
# Download the fileformat.info listing page for each Unicode general category
# that may appear in an identifier, then extract every "U+XXXX" code point
# found in those pages into list.txt (sorted, one per line).
#
# Identifier characters
# UnicodeLetter
# any character in the Unicode categories “Uppercase letter (Lu)”, “Lowercase letter (Ll)”,
# “Titlecase letter (Lt)”, “Modifier letter (Lm)”, “Other letter (Lo)”, or “Letter number (Nl)”.

# Stop at the first failing command or unset variable, so that list.txt is
# never generated from an incomplete set of pages.
set -eu

# Always work next to this script: the uc-*.htm glob below and the
# /unicode/uc-*.htm entries in .gitignore both assume that location.
cd -- "$(dirname -- "$0")"

base_url='http://www.fileformat.info/info/unicode/category'

for category in Lu Ll Lt Lm Lo Nl; do
    suffix=$(printf '%s' "$category" | tr '[:upper:]' '[:lower:]')
    page="uc-$suffix.htm"

    # A page that is already present and non-empty is reused, so re-running
    # the script does not fetch it again.
    if [ -s "$page" ]; then
        continue
    fi

    # Fetch into a temporary name and rename only on success. The pages are
    # generated dynamically, so resuming a partial file with "wget -c" is not
    # reliable; a failed transfer must not leave a truncated uc-*.htm behind.
    partial="$page.part"
    if ! wget "$base_url/$category/list.htm?mode=print" -O "$partial"; then
        rm -f "$partial"
        echo "doit.sh: download failed for category $category" >&2
        exit 1
    fi
    mv "$partial" "$page"
done

# Collect every code point reference of the form U+<hex digits> from all pages.
grep --no-filename -o -E "U\+[0-9a-fA-F]+" uc-*.htm | sort > list.txt

0 comments on commit c65678d

Please sign in to comment.