From 4b5895636c1ec06e630baf47881b246c198af056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A9za=20Herman?= Date: Wed, 6 Mar 2024 13:14:50 +0100 Subject: [PATCH] replace jansson parser with a custom one --- src/json.c | 1288 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 1046 insertions(+), 242 deletions(-) diff --git a/src/json.c b/src/json.c index e849ccaf7222..58aab0c45c3c 100644 --- a/src/json.c +++ b/src/json.c @@ -23,6 +23,7 @@ along with GNU Emacs. If not, see . */ #include #include #include +#include #include @@ -237,41 +238,6 @@ json_out_of_memory (void) xsignal0 (Qjson_out_of_memory); } -/* Signal a Lisp error corresponding to the JSON ERROR. */ - -static AVOID -json_parse_error (const json_error_t *error) -{ - Lisp_Object symbol; -#if JSON_HAS_ERROR_CODE - switch (json_error_code (error)) - { - case json_error_premature_end_of_input: - symbol = Qjson_end_of_file; - break; - case json_error_end_of_input_expected: - symbol = Qjson_trailing_content; - break; - default: - symbol = Qjson_parse_error; - break; - } -#else - if (json_has_suffix (error->text, "expected near end of file")) - symbol = Qjson_end_of_file; - else if (json_has_prefix (error->text, "end of file expected")) - symbol = Qjson_trailing_content; - else - symbol = Qjson_parse_error; -#endif - xsignal (symbol, - list5 (build_string_from_utf8 (error->text), - build_string_from_utf8 (error->source), - INT_TO_INTEGER (error->line), - INT_TO_INTEGER (error->column), - INT_TO_INTEGER (error->position))); -} - static void json_release_object (void *object) { @@ -794,145 +760,1009 @@ usage: (json-insert OBJECT &rest ARGS) */) return unbind_to (count, Qnil); } -/* Convert a JSON object to a Lisp object. */ +struct json_parser +{ + /* Because of a possible gap in the input (an emacs buffer can have + a gap), the input is described by [input_begin;input_end) and + [secondary_input_begin;secondary_input_end). If the input is + continuous, then secondary_input_begin and secondary_input_end + should be NULL */ + const unsigned char *input_current; + const unsigned char *input_begin; + const unsigned char *input_end; + + const unsigned char *secondary_input_begin; + const unsigned char *secondary_input_end; + + int current_line; + int current_column; + + /* The parser has a maximum allowed depth. available_depth + decreases at each object/array begin. If reaches zero, then an + error is generated */ + int available_depth; + + struct json_configuration conf; + + size_t additional_bytes_count; + + /* Lisp_Objects are collected in this area during object/array + parsing */ + Lisp_Object *object_workspace; + Lisp_Object *object_workspace_end; + Lisp_Object *object_workspace_current; + + /* String and number parsing uses this workspace */ + unsigned char *byte_workspace; + unsigned char *byte_workspace_end; + unsigned char *byte_workspace_current; +}; + +static AVOID +json_signal_error (struct json_parser *parser, Lisp_Object error) +{ + xsignal2 (error, INT_TO_INTEGER (parser->current_line), + INT_TO_INTEGER (parser->current_column)); +} + +static void +json_parser_init (struct json_parser *parser, + struct json_configuration conf, + const unsigned char *input, + const unsigned char *input_end, + const unsigned char *secondary_input, + const unsigned char *secondary_input_end) +{ + const int initial_workspace_size = 64; + const int initial_string_workspace_size = 512; + + if (secondary_input >= secondary_input_end) + { + secondary_input = NULL; + secondary_input_end = NULL; + } + + if (input < input_end) + { + parser->input_begin = input; + parser->input_end = input_end; + + parser->secondary_input_begin = secondary_input; + parser->secondary_input_end = secondary_input_end; + } + else + { + parser->input_begin = secondary_input; + parser->input_end = secondary_input_end; + + parser->secondary_input_begin = NULL; + parser->secondary_input_end = NULL; + } + + parser->input_current = parser->input_begin; + + parser->current_line = 1; + parser->current_column = 0; + parser->available_depth = 10000; + parser->conf = conf; + + parser->additional_bytes_count = 0; + + parser->object_workspace = + xnmalloc (initial_workspace_size, sizeof (Lisp_Object)); + parser->object_workspace_end = + parser->object_workspace + initial_workspace_size; + parser->object_workspace_current = parser->object_workspace; + + parser->byte_workspace = xmalloc (initial_string_workspace_size); + parser->byte_workspace_end = + parser->byte_workspace + initial_string_workspace_size; +} + +static void +json_parser_done (void *parser) +{ + struct json_parser *p = (struct json_parser *) parser; + xfree (p->object_workspace); + xfree (p->byte_workspace); +} + +/* Makes sure that the object_workspace has 'size' available space for + Lisp_Objects */ +static void +json_make_object_workspace_for (struct json_parser *parser, + size_t size) +{ + size_t available_size = + parser->object_workspace_end - parser->object_workspace_current; + if (available_size >= size) + { + return; + } + size_t needed_workspace_size = (parser->object_workspace_current + - parser->object_workspace + size); + size_t new_workspace_size = + parser->object_workspace_end - parser->object_workspace; + while (new_workspace_size < needed_workspace_size) + { + if (ckd_mul(&new_workspace_size, new_workspace_size, 2)) + { + json_signal_error(parser, Qjson_out_of_memory); + } + } + size_t offset = + parser->object_workspace_current - parser->object_workspace; + parser->object_workspace = + xnrealloc (parser->object_workspace, + new_workspace_size, sizeof (Lisp_Object)); + parser->object_workspace_end = + parser->object_workspace + new_workspace_size; + parser->object_workspace_current = + parser->object_workspace + offset; +} + +static void +json_byte_workspace_reset (struct json_parser *parser) +{ + parser->byte_workspace_current = parser->byte_workspace; +} + +/* Puts 'value' into the byte_workspace. If there is no space + available, it allocates space */ +static void +json_byte_workspace_put (struct json_parser *parser, + unsigned char value) +{ + if (parser->byte_workspace_current < parser->byte_workspace_end) + { + *parser->byte_workspace_current++ = value; + return; + } + + size_t new_workspace_size = + parser->byte_workspace_end - parser->byte_workspace; + if (ckd_mul(&new_workspace_size, new_workspace_size, 2)) + { + json_signal_error(parser, Qjson_out_of_memory); + } + + size_t offset = + parser->byte_workspace_current - parser->byte_workspace; + parser->byte_workspace = + xrealloc (parser->byte_workspace, new_workspace_size); + parser->byte_workspace_end = + parser->byte_workspace + new_workspace_size; + parser->byte_workspace_current = + parser->byte_workspace + offset; + *parser->byte_workspace_current++ = value; +} + +static bool +json_input_at_eof (struct json_parser *parser) +{ + if (parser->input_current < parser->input_end) + return false; + return parser->secondary_input_end == NULL; +} + +/* If there is a secondary buffer, it switches to it */ +static int +json_input_switch_to_secondary (struct json_parser *parser) +{ + if (parser->secondary_input_begin < parser->secondary_input_end) + { + parser->additional_bytes_count = + parser->input_end - parser->input_begin; + parser->input_begin = parser->secondary_input_begin; + parser->input_end = parser->secondary_input_end; + parser->input_current = parser->secondary_input_begin; + parser->secondary_input_begin = NULL; + parser->secondary_input_end = NULL; + return 0; + } + else + return -1; +} + +/* Reads a byte from the JSON input stream */ +static unsigned char +json_input_get (struct json_parser *parser) +{ + if (parser->input_current >= parser->input_end + && json_input_switch_to_secondary (parser) < 0) + json_signal_error (parser, Qjson_end_of_file); + return *parser->input_current++; +} + +/* Reads a byte from the JSON input stream, if the stream is not at + * eof. At eof, returns -1 */ +static int +json_input_get_if_possible (struct json_parser *parser) +{ + if (parser->input_current >= parser->input_end + && json_input_switch_to_secondary (parser) < 0) + return -1; + return *parser->input_current++; +} + +/* Puts back the last read input byte. Only one byte can be put back, + because otherwise this code would need to handle switching from + the secondary buffer to the initial */ +static void +json_input_put_back (struct json_parser *parser) +{ + parser->input_current--; +} + +static bool +json_skip_whitespace_internal (struct json_parser *parser, int c) +{ + parser->current_column++; + if (c == 0x20 || c == 0x09 || c == 0x0d) + return false; + else if (c == 0x0a) + { + parser->current_line++; + parser->current_column = 0; + return false; + } + else + return true; +} + +/* Skips JSON whitespace, and returns with the first non-whitespace + * character */ +static int +json_skip_whitespace (struct json_parser *parser) +{ + for (;;) + { + int c = json_input_get (parser); + if (json_skip_whitespace_internal (parser, c)) + return c; + } +} + +/* Skips JSON whitespace, and returns with the first non-whitespace + * character, if possible. If there is no non-whitespace character + * (because we reached the end), it returns -1 */ +static int +json_skip_whitespace_if_possible (struct json_parser *parser) +{ + for (;;) + { + int c = json_input_get_if_possible (parser); + if (c < 0) + return c; + if (json_skip_whitespace_internal (parser, c)) + return c; + } +} + +static int +json_hex_value (int c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + else if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + else if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + else + return -1; +} + +/* Parses the CCCC part of the unicode escape sequence \uCCCC */ +static int +json_parse_unicode (struct json_parser *parser) +{ + unsigned char v[4]; + for (int i = 0; i < 4; i++) + { + int c = json_hex_value (json_input_get (parser)); + parser->current_column++; + if (c < 0) + json_signal_error (parser, Qjson_escape_sequence_error); + v[i] = c; + } + + return v[0] << 12 | v[1] << 8 | v[2] << 4 | v[3]; +} + +/* Parses an utf-8 code-point encoding (except the first byte), and + returns the numeric value of the code-point (without considering + the first byte) */ +static int +json_handle_utf8_tail_bytes (struct json_parser *parser, int n) +{ + int v = 0; + for (int i = 0; i < n; i++) + { + int c = json_input_get (parser); + json_byte_workspace_put (parser, c); + if ((c & 0xc0) != 0x80) + json_signal_error (parser, Qjson_utf8_decode_error); + v = (v << 6) | (c & 0x3f); + } + return v; +} + +/* Reads a JSON string, and puts the result into the byte workspace */ +static void +json_parse_string (struct json_parser *parser) +{ + /* a single_uninteresting byte can be simply copied from the input + to output, it doesn't need any extra care. This means all the + characters between [0x20;0x7f], except the double quote and + the backslash */ + static const char is_single_uninteresting[256] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 1 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 2 */ 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 3 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 4 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 5 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, + /* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 7 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 8 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 9 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* a */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* b */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* c */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* d */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* e */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + + for (;;) + { + /* This if is only here for a possible speedup. If there are 4 + bytes available, and all of them are single_uninteresting, + then we can just copy these 4 bytes to output */ + if (parser->input_end - parser->input_current >= 4) + { + int c0 = parser->input_current[0]; + int c1 = parser->input_current[1]; + int c2 = parser->input_current[2]; + int c3 = parser->input_current[3]; + bool v0 = is_single_uninteresting[c0]; + bool v1 = is_single_uninteresting[c1]; + bool v2 = is_single_uninteresting[c2]; + bool v3 = is_single_uninteresting[c3]; + if (v0 && v1 && v2 && v3) + { + json_byte_workspace_put (parser, c0); + json_byte_workspace_put (parser, c1); + json_byte_workspace_put (parser, c2); + json_byte_workspace_put (parser, c3); + parser->input_current += 4; + parser->current_column += 4; + continue; + } + } + + int c = json_input_get (parser); + parser->current_column++; + if (is_single_uninteresting[c]) + { + json_byte_workspace_put (parser, c); + continue; + } + + if (c == '"') + return; + else if (c & 0x80) + { + /* Handle utf-8 encoding */ + json_byte_workspace_put (parser, c); + if (c < 0xc0) + json_signal_error (parser, Qjson_utf8_decode_error); + else if (c < 0xe0) + { + int n = ((c & 0x1f) << 6 + | json_handle_utf8_tail_bytes (parser, 1)); + if (n < 0x80) + json_signal_error (parser, Qjson_utf8_decode_error); + } + else if (c < 0xf0) + { + int n = ((c & 0xf) << 12 + | json_handle_utf8_tail_bytes (parser, 2)); + if (n < 0x800 || (n >= 0xd800 && n < 0xe000)) + json_signal_error (parser, Qjson_utf8_decode_error); + } + else if (c < 0xf8) + { + int n = ((c & 0x7) << 18 + | json_handle_utf8_tail_bytes (parser, 3)); + if (n < 0x10000 || n > 0x10ffff) + json_signal_error (parser, Qjson_utf8_decode_error); + } + else + json_signal_error (parser, Qjson_utf8_decode_error); + } + else if (c == '\\') + { + /* Handle escape sequences */ + c = json_input_get (parser); + parser->current_column++; + if (c == '"') + json_byte_workspace_put (parser, '"'); + else if (c == '\\') + json_byte_workspace_put (parser, '\\'); + else if (c == '/') + json_byte_workspace_put (parser, '/'); + else if (c == 'b') + json_byte_workspace_put (parser, '\b'); + else if (c == 'f') + json_byte_workspace_put (parser, '\f'); + else if (c == 'n') + json_byte_workspace_put (parser, '\n'); + else if (c == 'r') + json_byte_workspace_put (parser, '\r'); + else if (c == 't') + json_byte_workspace_put (parser, '\t'); + else if (c == 'u') + { + int num = json_parse_unicode (parser); + /* is the first half of the surrogate pair */ + if (num >= 0xd800 && num < 0xdc00) + { + parser->current_column++; + if (json_input_get (parser) != '\\') + json_signal_error (parser, + Qjson_invalid_surrogate_error); + parser->current_column++; + if (json_input_get (parser) != 'u') + json_signal_error (parser, + Qjson_invalid_surrogate_error); + int num2 = json_parse_unicode (parser); + if (num2 < 0xdc00 || num2 >= 0xe000) + json_signal_error (parser, + Qjson_invalid_surrogate_error); + num = (0x10000 + + ((num - 0xd800) << 10 | (num2 - 0xdc00))); + } + else if (num >= 0xdc00 && num < 0xe000) + /* is the second half of the surrogate pair without + the first half */ + json_signal_error (parser, + Qjson_invalid_surrogate_error); + + /* utf-8 encode the code-point */ + if (num < 0x80) + json_byte_workspace_put (parser, num); + else if (num < 0x800) + { + json_byte_workspace_put (parser, 0xc0 | num >> 6); + json_byte_workspace_put (parser, + 0x80 | (num & 0x3f)); + } + else if (num < 0x10000) + { + json_byte_workspace_put (parser, 0xe0 | num >> 12); + json_byte_workspace_put (parser, + (0x80 + | ((num >> 6) & 0x3f))); + json_byte_workspace_put (parser, + 0x80 | (num & 0x3f)); + } + else + { + json_byte_workspace_put (parser, 0xf0 | num >> 18); + json_byte_workspace_put (parser, + (0x80 + | ((num >> 12) & 0x3f))); + json_byte_workspace_put (parser, + (0x80 + | ((num >> 6) & 0x3f))); + json_byte_workspace_put (parser, + 0x80 | (num & 0x3f)); + } + } + else + json_signal_error (parser, Qjson_escape_sequence_error); + } + else + json_signal_error (parser, Qjson_parse_error); + } +} + +/* If there was no integer overflow during parsing the integer, this + puts 'value' to the output. Otherwise this calls string_to_number + to parse integer on the byte workspace. This could just always + call string_to_number, but for performance reasons, during parsing + the code tries to calculate the value, so in most cases, we can + save call of string_to_number */ +static Lisp_Object +json_create_integer (struct json_parser *parser, bool integer_overflow, + bool negative, EMACS_UINT value) +{ + if (!integer_overflow) + { + if (negative) + { + uintmax_t v = value; + if (v <= (uintmax_t) INTMAX_MAX + 1) + return INT_TO_INTEGER ((intmax_t) -v); + } + else + return INT_TO_INTEGER (value); + } + + json_byte_workspace_put (parser, 0); + ptrdiff_t len; + Lisp_Object result = + string_to_number ((const char *) parser->byte_workspace, 10, + &len); + if (len + != parser->byte_workspace_current - parser->byte_workspace - 1) + json_signal_error (parser, Qjson_error); + return result; +} + +/* Parses a float using the byte workspace */ +static Lisp_Object +json_create_float (struct json_parser *parser) +{ + json_byte_workspace_put (parser, 0); + errno = 0; + char *e; + double value = strtod ((const char *) parser->byte_workspace, &e); + bool out_of_range = + (errno != 0 && (value == HUGE_VAL || value == -HUGE_VAL)); + if (out_of_range) + json_signal_error (parser, Qjson_number_out_of_range); + else if ((const unsigned char *) e + != parser->byte_workspace_current - 1) + json_signal_error (parser, Qjson_error); + else + return make_float (value); +} + +/* Parses a number. The first character is the input parameter 'c'. + */ +static Lisp_Object +json_parse_number (struct json_parser *parser, int c) +{ + json_byte_workspace_reset (parser); + json_byte_workspace_put (parser, c); + + bool negative = false; + if (c == '-') + { + negative = true; + c = json_input_get (parser); + json_byte_workspace_put (parser, c); + parser->current_column++; + } + if (c < '0' || c > '9') + json_signal_error (parser, Qjson_parse_error); + + /* The idea is that during finding the last character of the + number, the for loop below also tries to calculate the value. If + the parsed number is an integer which fits into unsigned long, + then the parser can use the value of 'integer' right away, + instead of having to re-parse the byte workspace later. + Ideally, this integer should have the same size as a CPU general + purpose register. */ + EMACS_UINT integer = c - '0'; + bool integer_overflow = false; + + if (integer == 0) + { + if (json_input_at_eof (parser)) + return INT_TO_INTEGER (0); + c = json_input_get (parser); + } + else + { + for (;;) + { + if (json_input_at_eof (parser)) + return json_create_integer (parser, integer_overflow, + negative, integer); + c = json_input_get (parser); + if (c < '0' || c > '9') + break; + json_byte_workspace_put (parser, c); + parser->current_column++; + + integer_overflow |= ckd_mul (&integer, integer, 10); + integer_overflow |= ckd_add (&integer, integer, c - '0'); + } + } + + bool is_float = false; + if (c == '.') + { + json_byte_workspace_put (parser, c); + parser->current_column++; + + is_float = true; + c = json_input_get (parser); + json_byte_workspace_put (parser, c); + parser->current_column++; + if (c < '0' || c > '9') + json_signal_error (parser, Qjson_parse_error); + for (;;) + { + if (json_input_at_eof (parser)) + return json_create_float (parser); + c = json_input_get (parser); + if (c < '0' || c > '9') + break; + json_byte_workspace_put (parser, c); + parser->current_column++; + } + } + if (c == 'e' || c == 'E') + { + json_byte_workspace_put (parser, c); + parser->current_column++; + + is_float = true; + c = json_input_get (parser); + json_byte_workspace_put (parser, c); + parser->current_column++; + if (c == '-' || c == '+') + { + c = json_input_get (parser); + json_byte_workspace_put (parser, c); + parser->current_column++; + } + if (c < '0' || c > '9') + json_signal_error (parser, Qjson_parse_error); + for (;;) + { + if (json_input_at_eof (parser)) + return json_create_float (parser); + c = json_input_get (parser); + if (c < '0' || c > '9') + break; + json_byte_workspace_put (parser, c); + parser->current_column++; + } + } + + /* 'c' contains a character which is not part of the number, + so it is need to be put back */ + json_input_put_back (parser); + + if (is_float) + return json_create_float (parser); + else + return json_create_integer (parser, integer_overflow, negative, + integer); +} + +static Lisp_Object json_parse_value (struct json_parser *parser, + int c); + +/* Parses a JSON array. */ +static Lisp_Object +json_parse_array (struct json_parser *parser) +{ + int c = json_skip_whitespace (parser); + + const size_t begin_offset = + parser->object_workspace_current - parser->object_workspace; + + if (c != ']') + { + parser->available_depth--; + if (parser->available_depth < 0) + json_signal_error (parser, Qjson_object_too_deep); + + size_t number_of_elements = 0; + /* This loop collects the array elements in the object workspace + */ + for (;;) + { + Lisp_Object element = json_parse_value (parser, c); + json_make_object_workspace_for (parser, 1); + *parser->object_workspace_current++ = element; + + c = json_skip_whitespace (parser); + + number_of_elements++; + if (c == ']') + { + parser->available_depth++; + break; + } + + if (c != ',') + json_signal_error (parser, Qjson_parse_error); + + c = json_skip_whitespace (parser); + } + } + + Lisp_Object result; + const Lisp_Object *b = parser->object_workspace + begin_offset; + size_t number_of_elements = parser->object_workspace_current - b; + + switch (parser->conf.array_type) + { + case json_array_array: + result = make_vector (number_of_elements, Qunbound); + for (size_t i = 0; i < number_of_elements; i++) + { + rarely_quit (i); + ASET (result, i, *b++); + } + break; + case json_array_list: + b = parser->object_workspace_current; + result = Qnil; + for (size_t i = 0; i < number_of_elements; ++i) + { + rarely_quit (i); + result = Fcons (*(--b), result); + } + break; + default: + emacs_abort (); + } -static Lisp_Object ARG_NONNULL ((1)) -json_to_lisp (json_t *json, const struct json_configuration *conf) + parser->object_workspace_current = + parser->object_workspace + begin_offset; + + return result; +} + +/* Parses a JSON object. */ +static Lisp_Object +json_parse_object (struct json_parser *parser) { - switch (json_typeof (json)) + int c = json_skip_whitespace (parser); + + const size_t begin_offset = + parser->object_workspace_current - parser->object_workspace; + + if (c != '}') { - case JSON_NULL: - return conf->null_object; - case JSON_FALSE: - return conf->false_object; - case JSON_TRUE: - return Qt; - case JSON_INTEGER: + parser->available_depth--; + if (parser->available_depth < 0) + json_signal_error (parser, Qjson_object_too_deep); + + /* This loop collects the object members (key/value pairs) in + * the object workspace */ + for (;;) + { + if (c != '"') + json_signal_error (parser, Qjson_parse_error); + + Lisp_Object key; + json_byte_workspace_reset (parser); + switch (parser->conf.object_type) + { + case json_object_hashtable: + { + json_parse_string (parser); + key = + make_string_from_utf8 ((char *)parser->byte_workspace, + (parser->byte_workspace_current + - parser->byte_workspace)); + break; + } + case json_object_alist: + { + json_parse_string (parser); + key = Fintern (make_string_from_utf8 ((char *) parser->byte_workspace, + (parser->byte_workspace_current + - parser->byte_workspace)), + Qnil); + break; + } + case json_object_plist: + { + json_byte_workspace_put (parser, ':'); + json_parse_string (parser); + key = intern_1 ((char *) parser->byte_workspace, + (parser->byte_workspace_current + - parser->byte_workspace)); + break; + } + default: + emacs_abort (); + } + + c = json_skip_whitespace (parser); + if (c != ':') + json_signal_error (parser, Qjson_parse_error); + + c = json_skip_whitespace (parser); + + Lisp_Object value = json_parse_value (parser, c); + + json_make_object_workspace_for (parser, 2); + *parser->object_workspace_current++ = key; + *parser->object_workspace_current++ = value; + + c = json_skip_whitespace (parser); + + if (c == '}') + { + parser->available_depth++; + break; + } + + if (c != ',') + json_signal_error (parser, Qjson_parse_error); + + c = json_skip_whitespace (parser); + } + } + + Lisp_Object result; + switch (parser->conf.object_type) + { + case json_object_hashtable: { - json_int_t i = json_integer_value (json); - return INT_TO_INTEGER (i); + Lisp_Object *end = parser->object_workspace_current; + Lisp_Object *member = parser->object_workspace + begin_offset; + result = CALLN (Fmake_hash_table, QCtest, Qequal, QCsize, + make_fixed_natnum ((end - member)/2)); + struct Lisp_Hash_Table *h = XHASH_TABLE (result); + while (member < end) + { + hash_hash_t hash; + ptrdiff_t i = hash_lookup_get_hash (h, member[0], &hash); + if (i < 0) + hash_put (h, member[0], member[1], hash); + else + set_hash_value_slot (h, i, member[1]); + + member += 2; + } + break; } - case JSON_REAL: - return make_float (json_real_value (json)); - case JSON_STRING: - return make_string_from_utf8 (json_string_value (json), - json_string_length (json)); - case JSON_ARRAY: + case json_object_alist: { - if (++lisp_eval_depth > max_lisp_eval_depth) - xsignal0 (Qjson_object_too_deep); - size_t size = json_array_size (json); - if (PTRDIFF_MAX < size) - overflow_error (); - Lisp_Object result; - switch (conf->array_type) - { - case json_array_array: - { - result = make_vector (size, Qunbound); - for (ptrdiff_t i = 0; i < size; ++i) - { - rarely_quit (i); - ASET (result, i, - json_to_lisp (json_array_get (json, i), conf)); - } - break; - } - case json_array_list: - { - result = Qnil; - for (ptrdiff_t i = size - 1; i >= 0; --i) - { - rarely_quit (i); - result = Fcons (json_to_lisp (json_array_get (json, i), conf), - result); - } - break; - } - default: - /* Can't get here. */ - emacs_abort (); - } - --lisp_eval_depth; - return result; + Lisp_Object *member = parser->object_workspace_current; + Lisp_Object *begin = parser->object_workspace + begin_offset; + result = Qnil; + while (member > begin) + { + member -= 2; + result = Fcons (Fcons (member[0], member[1]), result); + } + break; } - case JSON_OBJECT: + case json_object_plist: { - if (++lisp_eval_depth > max_lisp_eval_depth) - xsignal0 (Qjson_object_too_deep); - Lisp_Object result; - switch (conf->object_type) - { - case json_object_hashtable: - { - size_t size = json_object_size (json); - if (FIXNUM_OVERFLOW_P (size)) - overflow_error (); - result = CALLN (Fmake_hash_table, QCtest, Qequal, QCsize, - make_fixed_natnum (size)); - struct Lisp_Hash_Table *h = XHASH_TABLE (result); - const char *key_str; - json_t *value; - json_object_foreach (json, key_str, value) - { - Lisp_Object key = build_string_from_utf8 (key_str); - hash_hash_t hash; - ptrdiff_t i = hash_lookup_get_hash (h, key, &hash); - /* Keys in JSON objects are unique, so the key can't - be present yet. */ - eassert (i < 0); - hash_put (h, key, json_to_lisp (value, conf), hash); - } - break; - } - case json_object_alist: - { - result = Qnil; - const char *key_str; - json_t *value; - json_object_foreach (json, key_str, value) - { - Lisp_Object key - = Fintern (build_string_from_utf8 (key_str), Qnil); - result - = Fcons (Fcons (key, json_to_lisp (value, conf)), - result); - } - result = Fnreverse (result); - break; - } - case json_object_plist: - { - result = Qnil; - const char *key_str; - json_t *value; - json_object_foreach (json, key_str, value) - { - USE_SAFE_ALLOCA; - ptrdiff_t key_str_len = strlen (key_str); - char *keyword_key_str = SAFE_ALLOCA (1 + key_str_len + 1); - keyword_key_str[0] = ':'; - strcpy (&keyword_key_str[1], key_str); - Lisp_Object key = intern_1 (keyword_key_str, key_str_len + 1); - /* Build the plist as value-key since we're going to - reverse it in the end.*/ - result = Fcons (key, result); - result = Fcons (json_to_lisp (value, conf), result); - SAFE_FREE (); - } - result = Fnreverse (result); - break; - } - default: - /* Can't get here. */ - emacs_abort (); - } - --lisp_eval_depth; - return result; + Lisp_Object *member = parser->object_workspace_current; + Lisp_Object *begin = parser->object_workspace + begin_offset; + result = Qnil; + while (member > begin) + { + member -= 2; + result = Fcons (member[1], result); + result = Fcons (member[0], result); + } + break; } + default: + emacs_abort (); } - /* Can't get here. */ - emacs_abort (); + + parser->object_workspace_current = + parser->object_workspace + begin_offset; + + return result; +} + +/* Token-char is not a JSON terminology. When parsing + null/false/true, this function tells the character set that is need + to be considered as part of a token. For example, if the input is + "truesomething", then the parser shouldn't consider it as "true", + and an additional later "something" token. An additional example: + if the input is "truetrue", then calling (json-parse-buffer) twice + shouldn't produce two successful calls which return t, but a + parsing error */ +static bool +json_is_token_char (int c) +{ + return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') + || (c >= '0' && c <= '9') || (c == '-')); +} + +/* This is the entry point to the value parser, this parses a JSON + * value */ +Lisp_Object +json_parse_value (struct json_parser *parser, int c) +{ + if (c == '{') + return json_parse_object (parser); + else if (c == '[') + return json_parse_array (parser); + else if (c == '"') + { + json_byte_workspace_reset (parser); + json_parse_string (parser); + Lisp_Object result = + make_string_from_utf8 ((const char *) parser->byte_workspace, + (parser->byte_workspace_current + - parser->byte_workspace)); + return result; + } + else if ((c >= '0' && c <= '9') || (c == '-')) + return json_parse_number (parser, c); + else + { + int c2 = json_input_get (parser); + int c3 = json_input_get (parser); + int c4 = json_input_get (parser); + int c5 = json_input_get_if_possible (parser); + + if (c == 't' && c2 == 'r' && c3 == 'u' && c4 == 'e' + && (c5 < 0 || !json_is_token_char (c5))) + { + if (c5 >= 0) + json_input_put_back (parser); + parser->current_column += 4; + return Qt; + } + if (c == 'n' && c2 == 'u' && c3 == 'l' && c4 == 'l' + && (c5 < 0 || !json_is_token_char (c5))) + { + if (c5 >= 0) + json_input_put_back (parser); + parser->current_column += 4; + return parser->conf.null_object; + } + if (c == 'f' && c2 == 'a' && c3 == 'l' && c4 == 's' + && c5 == 'e') + { + int c6 = json_input_get_if_possible (parser); + if (c6 < 0 || !json_is_token_char (c6)) + { + if (c6 >= 0) + json_input_put_back (parser); + parser->current_column += 5; + return parser->conf.false_object; + } + } + + json_signal_error (parser, Qjson_parse_error); + } +} + +enum ParseEndBehavior + { + PARSEENDBEHAVIOR_CheckForGarbage, + PARSEENDBEHAVIOR_MovePoint + }; + +static Lisp_Object +json_parse (struct json_parser *parser, + enum ParseEndBehavior parse_end_behavior) +{ + int c = json_skip_whitespace (parser); + + Lisp_Object result = json_parse_value (parser, c); + + switch (parse_end_behavior) + { + case PARSEENDBEHAVIOR_CheckForGarbage: + c = json_skip_whitespace_if_possible (parser); + if (c >= 0) + json_signal_error (parser, Qjson_trailing_content); + break; + case PARSEENDBEHAVIOR_MovePoint: + { + ptrdiff_t point = (PT_BYTE + parser->input_current + - parser->input_begin + + parser->additional_bytes_count); + SET_PT_BOTH (BYTE_TO_CHAR (point), point); + break; + } + } + + return result; } DEFUN ("json-parse-string", Fjson_parse_string, Sjson_parse_string, 1, MANY, @@ -950,7 +1780,9 @@ The arguments ARGS are a list of keyword/argument pairs: The keyword argument `:object-type' specifies which Lisp type is used to represent objects; it can be `hash-table', `alist' or `plist'. It -defaults to `hash-table'. +defaults to `hash-table'. If an object has members with the same +key, `hash-table' keeps only the last value of such keys, while +`alist' and `plist' keep all the members. The keyword argument `:array-type' specifies which Lisp type is used to represent arrays; it can be `array' (the default) or `list'. @@ -961,62 +1793,28 @@ to represent a JSON null value. It defaults to `:null'. The keyword argument `:false-object' specifies which object to use to represent a JSON false value. It defaults to `:false'. usage: (json-parse-string STRING &rest ARGS) */) - (ptrdiff_t nargs, Lisp_Object *args) +(ptrdiff_t nargs, Lisp_Object *args) { specpdl_ref count = SPECPDL_INDEX (); -#ifdef WINDOWSNT - ensure_json_available (); -#endif - Lisp_Object string = args[0]; CHECK_STRING (string); Lisp_Object encoded = json_encode (string); - check_string_without_embedded_nulls (encoded); - struct json_configuration conf = - {json_object_hashtable, json_array_array, QCnull, QCfalse}; + struct json_configuration conf = { json_object_hashtable, + json_array_array, QCnull, + QCfalse }; json_parse_args (nargs - 1, args + 1, &conf, true); - json_error_t error; - json_t *object - = json_loads (SSDATA (encoded), JSON_DECODE_ANY | JSON_ALLOW_NUL, &error); - if (object == NULL) - json_parse_error (&error); - - /* Avoid leaking the object in case of further errors. */ - if (object != NULL) - record_unwind_protect_ptr (json_release_object, object); + struct json_parser p; + const unsigned char *begin = + (const unsigned char *) SSDATA (encoded); + json_parser_init (&p, conf, begin, begin + SBYTES (encoded), NULL, + NULL); + record_unwind_protect_ptr (json_parser_done, &p); - return unbind_to (count, json_to_lisp (object, &conf)); -} - -struct json_read_buffer_data -{ - /* Byte position of position to read the next chunk from. */ - ptrdiff_t point; -}; - -/* Callback for json_load_callback that reads from the current buffer. - DATA must point to a structure of type json_read_buffer_data. - data->point must point to the byte position to read from; after - reading, data->point is advanced accordingly. The buffer point - itself is ignored. This function may not exit nonlocally. */ - -static size_t -json_read_buffer_callback (void *buffer, size_t buflen, void *data) -{ - struct json_read_buffer_data *d = data; - - /* First, parse from point to the gap or the end of the accessible - portion, whatever is closer. */ - ptrdiff_t point = d->point; - ptrdiff_t end = BUFFER_CEILING_OF (point) + 1; - ptrdiff_t count = end - point; - if (buflen < count) - count = buflen; - memcpy (buffer, BYTE_POS_ADDR (point), count); - d->point += count; - return count; + return unbind_to (count, + json_parse (&p, + PARSEENDBEHAVIOR_CheckForGarbage)); } DEFUN ("json-parse-buffer", Fjson_parse_buffer, Sjson_parse_buffer, @@ -1038,7 +1836,9 @@ The arguments ARGS are a list of keyword/argument pairs: The keyword argument `:object-type' specifies which Lisp type is used to represent objects; it can be `hash-table', `alist' or `plist'. It -defaults to `hash-table'. +defaults to `hash-table'. If an object has members with the same +key, `hash-table' keeps only the last value of such keys, while +`alist' and `plist' keep all the members. The keyword argument `:array-type' specifies which Lisp type is used to represent arrays; it can be `array' (the default) or `list'. @@ -1049,42 +1849,34 @@ to represent a JSON null value. It defaults to `:null'. The keyword argument `:false-object' specifies which object to use to represent a JSON false value. It defaults to `:false'. usage: (json-parse-buffer &rest args) */) - (ptrdiff_t nargs, Lisp_Object *args) +(ptrdiff_t nargs, Lisp_Object *args) { specpdl_ref count = SPECPDL_INDEX (); -#ifdef WINDOWSNT - ensure_json_available (); -#endif - - struct json_configuration conf = - {json_object_hashtable, json_array_array, QCnull, QCfalse}; + struct json_configuration conf = { json_object_hashtable, + json_array_array, QCnull, + QCfalse }; json_parse_args (nargs, args, &conf, true); - ptrdiff_t point = PT_BYTE; - struct json_read_buffer_data data = {.point = point}; - json_error_t error; - json_t *object - = json_load_callback (json_read_buffer_callback, &data, - JSON_DECODE_ANY - | JSON_DISABLE_EOF_CHECK - | JSON_ALLOW_NUL, - &error); - - if (object == NULL) - json_parse_error (&error); - - /* Avoid leaking the object in case of further errors. */ - record_unwind_protect_ptr (json_release_object, object); - - /* Convert and then move point only if everything succeeded. */ - Lisp_Object lisp = json_to_lisp (object, &conf); + struct json_parser p; + unsigned char *begin = PT_ADDR; + unsigned char *end = GPT_ADDR; + unsigned char *secondary_begin = NULL; + unsigned char *secondary_end = NULL; + if (GPT_ADDR < Z_ADDR) + { + secondary_begin = GAP_END_ADDR; + if (secondary_begin < PT_ADDR) + secondary_begin = PT_ADDR; + secondary_end = Z_ADDR; + } - /* Adjust point by how much we just read. */ - point += error.position; - SET_PT_BOTH (BYTE_TO_CHAR (point), point); + json_parser_init (&p, conf, begin, end, secondary_begin, + secondary_end); + record_unwind_protect_ptr (json_parser_done, &p); - return unbind_to (count, lisp); + return unbind_to (count, + json_parse (&p, PARSEENDBEHAVIOR_MovePoint)); } void @@ -1102,6 +1894,10 @@ syms_of_json (void) DEFSYM (Qjson_end_of_file, "json-end-of-file"); DEFSYM (Qjson_trailing_content, "json-trailing-content"); DEFSYM (Qjson_object_too_deep, "json-object-too-deep"); + DEFSYM (Qjson_utf8_decode_error, "json-utf8-decode-error") + DEFSYM (Qjson_invalid_surrogate_error, "json-invalid-surrogate-error") + DEFSYM (Qjson_number_out_of_range, "json-number-out-of-range-error") + DEFSYM (Qjson_escape_sequence_error, "json-escape-sequence-error") DEFSYM (Qjson_unavailable, "json-unavailable"); define_error (Qjson_error, "generic JSON error", Qerror); define_error (Qjson_out_of_memory, @@ -1113,6 +1909,14 @@ syms_of_json (void) Qjson_parse_error); define_error (Qjson_object_too_deep, "object cyclic or Lisp evaluation too deep", Qjson_error); + define_error (Qjson_utf8_decode_error, + "invalid utf-8 encoding", Qjson_error); + define_error (Qjson_invalid_surrogate_error, + "invalid surrogate pair", Qjson_error); + define_error (Qjson_number_out_of_range, + "number out of range", Qjson_error); + define_error (Qjson_escape_sequence_error, + "invalid escape sequence", Qjson_parse_error); DEFSYM (Qpure, "pure"); DEFSYM (Qside_effect_free, "side-effect-free");