From f6e5cd2d1ef9e4cd226011072e2677575770c073 Mon Sep 17 00:00:00 2001 From: Mark Edgar Date: Wed, 26 Oct 2011 00:46:13 +0200 Subject: [PATCH] Tokenizer changes: * split up the tokenizer regex for readability * ignore inter-token whitespace * allow backslash-escapes in strings * disallow control characters in strings * clean up number regex and don't allow . (or the empty string) * fall-through token-matching case is . leaving the parser to validate it. Parser changes: * disallow non-string keys in objects * detect unrecognized tokens, e.g. @ A $ etc. * consistent quoting on case patterns * use ${token:-EOF} in error messages fixed wrong tests (and removed the wrong-test-generator) added new tests. fix test/valid-test.sh to count failures like test/invalid-test.sh removed unnecessary outlog/errlog files (and add to .gitignore) --- .gitignore | 5 +- all-tests.sh | 2 +- errlog | 1 - parse.sh | 105 +++++++++++------------ test/.generate-valid | 10 --- test/errlog | 1 - test/invalid-test.sh | 18 ++-- test/invalid/bad_unicode_sequence.json | 1 + test/invalid/bareword.json | 1 + test/invalid/bracket_key.json | 1 + test/invalid/colon.json | 1 + test/invalid/colon_obj.json | 3 + test/invalid/control_char_in_string.json | 1 + test/invalid/decimal_point.json | 1 + outlog => test/invalid/empty.json | 0 test/invalid/false_key.json | 1 + test/invalid/null_key.json | 1 + test/invalid/number_key.json | 1 + test/invalid/trailing_garbage.json | 1 + test/invalid/true_key.json | 1 + test/invalid/unclosed_string.json | 1 + test/invalid/weird.json | 1 + test/invalid/weird_key.json | 1 + test/out | 0 test/outlog | 1 - test/tokenizer-test.sh | 4 +- test/valid-test.sh | 38 +++----- test/valid/nested_array.parsed | 4 +- test/valid/nested_object.parsed | 2 +- test/valid/tab_escape.json | 1 + test/valid/tab_escape.parsed | 1 + 31 files changed, 97 insertions(+), 113 deletions(-) delete mode 100644 errlog delete mode 100644 test/.generate-valid delete mode 100644 test/errlog create mode 100644 test/invalid/bad_unicode_sequence.json create mode 100644 test/invalid/bareword.json create mode 100644 test/invalid/bracket_key.json create mode 100644 test/invalid/colon.json create mode 100644 test/invalid/colon_obj.json create mode 100644 test/invalid/control_char_in_string.json create mode 100644 test/invalid/decimal_point.json rename outlog => test/invalid/empty.json (100%) create mode 100644 test/invalid/false_key.json create mode 100644 test/invalid/null_key.json create mode 100644 test/invalid/number_key.json create mode 100644 test/invalid/trailing_garbage.json create mode 100644 test/invalid/true_key.json create mode 100644 test/invalid/unclosed_string.json create mode 100644 test/invalid/weird.json create mode 100644 test/invalid/weird_key.json delete mode 100644 test/out delete mode 100644 test/outlog create mode 100644 test/valid/tab_escape.json create mode 100644 test/valid/tab_escape.parsed diff --git a/.gitignore b/.gitignore index 13abef4..05269a5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,2 @@ -node_modules -node_modules/* -npm_debug.log +test/errlog +test/outlog diff --git a/all-tests.sh b/all-tests.sh index 6a30b34..4f6596c 100755 --- a/all-tests.sh +++ b/all-tests.sh @@ -27,6 +27,6 @@ done if [ $fail -eq 0 ]; then echo -n 'SUCCESS ' else - echo -n 'FAILOUR ' + echo -n 'FAILURE ' fi echo $passed / $tests diff --git a/errlog b/errlog deleted file mode 100644 index ca28aa1..0000000 --- a/errlog +++ /dev/null @@ -1 +0,0 @@ -/home/dominic/dev/JSON.sh/test/invalid-test.sh: line 10: ../bin/json_parse: No such file or directory diff --git a/parse.sh b/parse.sh index 41eac50..661f25e 100644 --- a/parse.sh +++ b/parse.sh @@ -1,93 +1,90 @@ throw () { - echo $* >&2 + echo "$*" >&2 exit 1 } tokenize () { - egrep -ao '[]|[{}]|:|,|("((\\")|[^"])*")|:|(\-?[0-9]*\.?([0-9]*)?(e?\-?([0-9]*))?)|null|true|false' --color=never + local ESCAPE='(\\[^u[:cntrl:]]|\\u[0-9a-fA-F]{4})' + local CHAR='[^[:cntrl:]"\\]' + local STRING="\"$CHAR*($ESCAPE$CHAR*)*\"" + local NUMBER='-?(0|[1-9][0-9]*)([.][0-9]*)?([eE][+-]?[0-9]*)?' + local KEYWORD='null|false|true' + local SPACE='[[:space:]]+' + egrep -ao "$STRING|$NUMBER|$KEYWORD|$SPACE|." --color=never | + egrep -v "^$SPACE$" # eat whitespace } parse_array () { local index=0 local ary='' - read token + read -r token while true; do - key=$index case "$token" in ']') break ;; - *) - parse_value "$1" "$index" - let index=$index+1 - ary="$ary""$value" - read token - case "$token" in - ']') break ;; - ',') ary="$ary", ;; - *) - if [ "_$token" = _ ]; then token=EOF; fi - throw "EXPECTED ] or , GOT $token" - ;; - esac - read token - ;; esac + parse_value "$1" "$index" + let index=$index+1 + ary="$ary""$value" + read -r token + case "$token" in + ']') break ;; + ',') ary="$ary", ;; + *) throw "EXPECTED ] or , GOT ${token:-EOF}" ;; + esac + read -r token done value=`printf '[%s]' $ary` } parse_object () { - local go=true + local key local obj='' - local EXPECT_COMMA=0 - local EXPECT_COLON=0 - read token - while [ "$go" = true ]; + read -r token + while : do case "$token" in '}') break ;; - *) - - key=$token - read colon - if [ "$colon" != ':' ]; then throw "EXPECTED COLON, GOT $colon"; fi - if [ "_$key" = _ ]; then throw "NULL KEY"; fi - read token - parse_value "$1" "$key" - obj="$obj$key:$value" - - read token - case "$token" in - '}') break;; - ,) obj="$obj,"; read token ;; - *) - if [ "_$token" = _ ]; then token=EOF; fi - throw "EXPECTED , or }, but got $token" - ;; - esac - ;; + '"'*'"') key=$token ;; + *) throw "EXPECTED STRING, GOT ${token:-EOF}" ;; + esac + read -r token + case "$token" in + ':') ;; + *) throw "EXPECTED COLON, GOT ${token:-EOF}" ;; + esac + read -r token + parse_value "$1" "$key" + obj="$obj$key:$value" + read -r token + case "$token" in + '}') break;; + ',') obj="$obj,"; read -r token ;; + *) throw "EXPECTED , or }, but got ${token:-EOF}" ;; esac done value=`printf '{%s}' "$obj"` } parse_value () { - local jpath - - if [ "x$1" = "x" ]; then jpath="$2"; else jpath="$1,$2"; fi - + local jpath="${1:+$1,}$2" case "$token" in - {) parse_object "$jpath" ;; - [) parse_array "$jpath" ;; - ','|'}'|']') throw "EXPECTED value, GOT $token" ;; - *) value=$token - ;; + '{') parse_object "$jpath" ;; + '[') parse_array "$jpath" ;; + # At this point, the only valid single-character tokens are digits. + ''|[^0-9]) throw "EXPECTED value, GOT ${token:-EOF}" ;; + *) value=$token ;; esac printf "[%s]\t%s\n" "$jpath" "$value" } parse () { - read token + read -r token parse_value + read -r token + case "$token" in + '') ;; + *) throw "EXPECTED EOF, GOT $token" ;; + esac } diff --git a/test/.generate-valid b/test/.generate-valid deleted file mode 100644 index b04084b..0000000 --- a/test/.generate-valid +++ /dev/null @@ -1,10 +0,0 @@ -echo "set -e" -for input in valid/*.json -do -expected=${input%.json}.parsed -cat $input | ../bin/json_parse > $expected -echo "# $input" -echo "diff <(cat $input | ../bin/json_parse) ${input%.json}.parsed" -echo "echo OK $input" - -done \ No newline at end of file diff --git a/test/errlog b/test/errlog deleted file mode 100644 index 7774584..0000000 --- a/test/errlog +++ /dev/null @@ -1 +0,0 @@ -EXPECTED , or }, but got EOF diff --git a/test/invalid-test.sh b/test/invalid-test.sh index 9ddb465..92fd897 100755 --- a/test/invalid-test.sh +++ b/test/invalid-test.sh @@ -13,21 +13,19 @@ echo PWD=$PWD fails=0 for input in invalid/* do - cat $input | ../bin/json_parse > outlog 2> errlog - ret=$? - if [ $ret -eq 0 ]; then + if ../bin/json_parse < "$input" > outlog 2> errlog + then echo "NOT OK: cat $input | ../bin/json_parse SHOULD FAIL" echo "OUTPUT WAS >>>" cat outlog echo "<<<" let fails=$fails+1 - else - echo "OK: cat $input | ../bin/json_parse failed correctly" - echo "stderr was >>>" - cat errlog - echo "<<<" +# else +# echo "OK: cat $input | ../bin/json_parse failed correctly" +# echo "stderr was >>>" +# cat errlog +# echo "<<<" fi - done echo "$fails test(s) failed" -exit $fails \ No newline at end of file +exit $fails diff --git a/test/invalid/bad_unicode_sequence.json b/test/invalid/bad_unicode_sequence.json new file mode 100644 index 0000000..a3cd4da --- /dev/null +++ b/test/invalid/bad_unicode_sequence.json @@ -0,0 +1 @@ +"hello\u20world" diff --git a/test/invalid/bareword.json b/test/invalid/bareword.json new file mode 100644 index 0000000..4d75b2d --- /dev/null +++ b/test/invalid/bareword.json @@ -0,0 +1 @@ +bareword diff --git a/test/invalid/bracket_key.json b/test/invalid/bracket_key.json new file mode 100644 index 0000000..684de58 --- /dev/null +++ b/test/invalid/bracket_key.json @@ -0,0 +1 @@ +{[: "bad"} diff --git a/test/invalid/colon.json b/test/invalid/colon.json new file mode 100644 index 0000000..397db75 --- /dev/null +++ b/test/invalid/colon.json @@ -0,0 +1 @@ +: diff --git a/test/invalid/colon_obj.json b/test/invalid/colon_obj.json new file mode 100644 index 0000000..9380111 --- /dev/null +++ b/test/invalid/colon_obj.json @@ -0,0 +1,3 @@ +{ + "hello": : +} diff --git a/test/invalid/control_char_in_string.json b/test/invalid/control_char_in_string.json new file mode 100644 index 0000000..8fdb891 --- /dev/null +++ b/test/invalid/control_char_in_string.json @@ -0,0 +1 @@ +"ab" diff --git a/test/invalid/decimal_point.json b/test/invalid/decimal_point.json new file mode 100644 index 0000000..9c558e3 --- /dev/null +++ b/test/invalid/decimal_point.json @@ -0,0 +1 @@ +. diff --git a/outlog b/test/invalid/empty.json similarity index 100% rename from outlog rename to test/invalid/empty.json diff --git a/test/invalid/false_key.json b/test/invalid/false_key.json new file mode 100644 index 0000000..3541e4a --- /dev/null +++ b/test/invalid/false_key.json @@ -0,0 +1 @@ +{false: "bad"} diff --git a/test/invalid/null_key.json b/test/invalid/null_key.json new file mode 100644 index 0000000..6c2e714 --- /dev/null +++ b/test/invalid/null_key.json @@ -0,0 +1 @@ +{null: "bad"} diff --git a/test/invalid/number_key.json b/test/invalid/number_key.json new file mode 100644 index 0000000..d0daab3 --- /dev/null +++ b/test/invalid/number_key.json @@ -0,0 +1 @@ +{5: "bad"} diff --git a/test/invalid/trailing_garbage.json b/test/invalid/trailing_garbage.json new file mode 100644 index 0000000..33becc9 --- /dev/null +++ b/test/invalid/trailing_garbage.json @@ -0,0 +1 @@ +[1,2,3]' diff --git a/test/invalid/true_key.json b/test/invalid/true_key.json new file mode 100644 index 0000000..093306f --- /dev/null +++ b/test/invalid/true_key.json @@ -0,0 +1 @@ +{true: "bad"} diff --git a/test/invalid/unclosed_string.json b/test/invalid/unclosed_string.json new file mode 100644 index 0000000..8e9add3 --- /dev/null +++ b/test/invalid/unclosed_string.json @@ -0,0 +1 @@ +"Hello world diff --git a/test/invalid/weird.json b/test/invalid/weird.json new file mode 100644 index 0000000..59c227c --- /dev/null +++ b/test/invalid/weird.json @@ -0,0 +1 @@ +@ diff --git a/test/invalid/weird_key.json b/test/invalid/weird_key.json new file mode 100644 index 0000000..c53a30d --- /dev/null +++ b/test/invalid/weird_key.json @@ -0,0 +1 @@ +{@: "bad"} diff --git a/test/out b/test/out deleted file mode 100644 index e69de29..0000000 diff --git a/test/outlog b/test/outlog deleted file mode 100644 index eb404b2..0000000 --- a/test/outlog +++ /dev/null @@ -1 +0,0 @@ -["hELLO"] "goodeoeu" diff --git a/test/tokenizer-test.sh b/test/tokenizer-test.sh index 7bbc850..3a61120 100644 --- a/test/tokenizer-test.sh +++ b/test/tokenizer-test.sh @@ -4,7 +4,7 @@ __filename=`readlink -f $0` __dirname=`dirname $__filename` cd $__dirname -. $__dirname/../parse.sh +. ../parse.sh set -e diff <( echo '"dah"' | tokenize ) <( echo '"dah"' ) @@ -37,4 +37,4 @@ diff <( echo '[ null , -110e10, "null" ]' \ diff <( echo '{"e": false}' | tokenize ) <( printf '{\n"e"\n:\nfalse\n}\n' ) diff <( echo '{"e": "string"}' | tokenize ) <( printf '{\n"e"\n:\n"string"\n}\n' ) -cat ../package.json | tokenize \ No newline at end of file +cat ../package.json | tokenize diff --git a/test/valid-test.sh b/test/valid-test.sh index 30c4de7..c2f7049 100755 --- a/test/valid-test.sh +++ b/test/valid-test.sh @@ -5,30 +5,14 @@ __dirname=`dirname $__filename` cd $__dirname set -e -# valid/array.json -diff <(cat valid/array.json | ../bin/json_parse) valid/array.parsed -echo OK valid/array.json -# valid/empty_array.json -diff <(cat valid/empty_array.json | ../bin/json_parse) valid/empty_array.parsed -echo OK valid/empty_array.json -# valid/empty_object.json -diff <(cat valid/empty_object.json | ../bin/json_parse) valid/empty_object.parsed -echo OK valid/empty_object.json -# valid/many_object.json -diff <(cat valid/many_object.json | ../bin/json_parse) valid/many_object.parsed -echo OK valid/many_object.json -# valid/nested_array.json -diff <(cat valid/nested_array.json | ../bin/json_parse) valid/nested_array.parsed -echo OK valid/nested_array.json -# valid/nested_object.json -diff <(cat valid/nested_object.json | ../bin/json_parse) valid/nested_object.parsed -echo OK valid/nested_object.json -# valid/number.json -diff <(cat valid/number.json | ../bin/json_parse) valid/number.parsed -echo OK valid/number.json -# valid/object.json -diff <(cat valid/object.json | ../bin/json_parse) valid/object.parsed -echo OK valid/object.json -# valid/string.json -diff <(cat valid/string.json | ../bin/json_parse) valid/string.parsed -echo OK valid/string.json +fails=0 +for input in valid/*.json +do + expected="${input%.json}.parsed" + if ! ../bin/json_parse < "$input" | diff -u - "$expected" + then + let fails=$fails+1 + fi +done +echo "$fails test(s) failed" +exit $fails diff --git a/test/valid/nested_array.parsed b/test/valid/nested_array.parsed index eac760b..4638c15 100644 --- a/test/valid/nested_array.parsed +++ b/test/valid/nested_array.parsed @@ -5,5 +5,5 @@ [2,2] {} [2] [4,"hello",{}] [3,"array"] [] -[3] {0:[]} -[] [1,[],[4,"hello",{}],{0:[]}] +[3] {"array":[]} +[] [1,[],[4,"hello",{}],{"array":[]}] diff --git a/test/valid/nested_object.parsed b/test/valid/nested_object.parsed index 6a220a5..8609e30 100644 --- a/test/valid/nested_object.parsed +++ b/test/valid/nested_object.parsed @@ -2,4 +2,4 @@ ["object","empty"] {} ["object"] {"key":"value","empty":{}} ["number"] 5 -[] {"empty":{"key":"value","empty":{}},"number":5} +[] {"object":{"key":"value","empty":{}},"number":5} diff --git a/test/valid/tab_escape.json b/test/valid/tab_escape.json new file mode 100644 index 0000000..b7e42b8 --- /dev/null +++ b/test/valid/tab_escape.json @@ -0,0 +1 @@ +"hello\tworld" diff --git a/test/valid/tab_escape.parsed b/test/valid/tab_escape.parsed new file mode 100644 index 0000000..ee69dd9 --- /dev/null +++ b/test/valid/tab_escape.parsed @@ -0,0 +1 @@ +[] "hello\tworld"