From 70adc7b24faa90036343f77c9adf2568ce682882 Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Mon, 15 Jun 2020 08:13:25 -0700 Subject: [PATCH 01/16] fix bad type coercion (int64_t to double) --- inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp index e272e20..e92e05e 100644 --- a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp +++ b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp @@ -231,6 +231,7 @@ inline constexpr auto Type_Doctor::common_R_type() co return rcpp_T::dbl; } if (i64_ && !(i32_ || lgl_ || u64_)) { + if (i64_ && !(lgl_ || u64_)) { // only 64/32-bit integers: will follow selected Int64_R_Type option return rcpp_T::i64; } From 47b770ce3226d34d9492a4ad0575eddd4ee43c66 Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Mon, 15 Jun 2020 08:20:59 -0700 Subject: [PATCH 02/16] move globals/macros to inst/include/RcppSimdJson/common.hpp and add documentation --- inst/include/RcppSimdJson.hpp | 62 --------- inst/include/RcppSimdJson/common.hpp | 121 ++++++++++++++++++ .../RcppSimdJson/deserialize/Type_Doctor.hpp | 13 +- .../RcppSimdJson/deserialize/scalar.hpp | 2 +- .../RcppSimdJson/deserialize/simplify.hpp | 20 +-- 5 files changed, 130 insertions(+), 88 deletions(-) create mode 100644 inst/include/RcppSimdJson/common.hpp diff --git a/inst/include/RcppSimdJson.hpp b/inst/include/RcppSimdJson.hpp index cef0285..1c0f8b8 100644 --- a/inst/include/RcppSimdJson.hpp +++ b/inst/include/RcppSimdJson.hpp @@ -1,68 +1,6 @@ #ifndef RCPPSIMDJSON_HPP #define RCPPSIMDJSON_HPP -#define STRICT_R_HEADERS -#include - - -namespace rcppsimdjson { - -static inline constexpr int64_t NA_INTEGER64 = LLONG_MIN; - - -enum class rcpp_T : int { - array = 0, - object = 1, - chr = 2, - u64 = 3, - dbl = 4, - i64 = 5, - i32 = 6, - lgl = 7, - null = 8, -}; - - -template static inline constexpr auto na_val() { - if constexpr (R_Type == rcpp_T::chr) { - return NA_STRING; - } - if constexpr (R_Type == rcpp_T::dbl) { - return NA_REAL; - } - if constexpr (R_Type == rcpp_T::i64) { - return NA_INTEGER64; - } - if constexpr (R_Type == rcpp_T::i32) { - return NA_INTEGER; - } - if constexpr (R_Type == rcpp_T::lgl) { - return NA_LOGICAL; - } -} - - -// #define SIMDJSON_EXCEPTIONS 0 -#ifdef SIMDJSON_EXCEPTIONS -#define RCPPSIMDJSON_EXCEPTIONS SIMDJSON_EXCEPTIONS -static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = SIMDJSON_EXCEPTIONS != 1; -#else -#define RCPPSIMDJSON_EXCEPTIONS 1 -static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = false; -#endif - - -static inline constexpr auto is_no_except(rcpp_T R_Type) -> bool { - // all scalars seem to be extractable w/o touching throwing code except for strings - return RCPPSIMDJSON_NO_EXCEPTIONS && R_Type != rcpp_T::chr; -} - - -} // namespace rcppsimdjson - -#include - -#include "RcppSimdJson/utils.hpp" #include "RcppSimdJson/deserialize.hpp" #endif diff --git a/inst/include/RcppSimdJson/common.hpp b/inst/include/RcppSimdJson/common.hpp new file mode 100644 index 0000000..44b88da --- /dev/null +++ b/inst/include/RcppSimdJson/common.hpp @@ -0,0 +1,121 @@ +#ifndef RCPPSIMDJSON_COMMON_HPP +#define RCPPSIMDJSON_COMMON_HPP + + +#define STRICT_R_HEADERS +#include + + +namespace rcppsimdjson { + +/* + * `bit64::integer64`-compatible `NA` + */ +static inline constexpr int64_t NA_INTEGER64 = LLONG_MIN; + + +/* + * Typing arguments that decide how `simdjson::dom::element`s are ultimate return to R. + */ +enum class rcpp_T : int { + array = 0, /* recursive: individual elements will decide ultimate R type */ + object = 1, /* recursive: individual elements will decide ultimate R type */ + chr = 2, /* always becomes `Rcpp::String`/`character(1L)` */ + u64 = 3, /* always becomes `Rcpp::String`/`character(1L)` */ + dbl = 4, /* always becomes `double` */ + i64 = 5, /* follows Int64_R_Type: `double`, `character(1L)`, or `bit64::integer64` */ + i32 = 6, /* always becomes `int` */ + lgl = 7, /* always becomes `bool */ + null = 8, /* becomes `NA` if returned in a vector, else `NULL */ +}; + + +/* + * Generic, typed `NA` inserter. + */ +template static inline constexpr auto na_val() { + if constexpr (R_Type == rcpp_T::chr) { + return NA_STRING; + } + if constexpr (R_Type == rcpp_T::dbl) { + return NA_REAL; + } + if constexpr (R_Type == rcpp_T::i64) { + return NA_INTEGER64; + } + if constexpr (R_Type == rcpp_T::i32) { + return NA_INTEGER; + } + if constexpr (R_Type == rcpp_T::lgl) { + return NA_LOGICAL; + } +} + + +/* + * Internal flags tracking whether simdjson is compiled with exceptions enabled (the default). + * If simdjson is compiled w/o exceptions (`#define SIMDJSON_EXCEPTIONS 0`), operations that do not + * touch throwing code can be annotated with keyword `noexcept` where appropriate. + * See inst/include/RcppSimdJson/deserialize/scalar.hpp for examples. + */ +// #define SIMDJSON_EXCEPTIONS 0 /* uncomment to disable compiling simdjson w/ exceptions */ +#ifdef SIMDJSON_EXCEPTIONS +#define RCPPSIMDJSON_EXCEPTIONS SIMDJSON_EXCEPTIONS +static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = SIMDJSON_EXCEPTIONS != 1; +#else +#define RCPPSIMDJSON_EXCEPTIONS 1 // NOLINT(cppcoreguidelines-macro-usage) +static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = false; +#endif + + +/* + * All scalar-getter functions are annotated with `is_no_except()`, which will be false if + * `RCPPSIMDJSON_NO_EXCEPTIONS` is enabled and their `rcpp_T` template argument is not + * `rcpp_T::chr` (strings are not currently extractable w/o touching throwing code). + */ +static inline constexpr auto is_no_except(rcpp_T R_Type) noexcept -> bool { + return RCPPSIMDJSON_NO_EXCEPTIONS && R_Type != rcpp_T::chr; +} + + +namespace deserialize { + + +static inline constexpr bool HAS_NULLS = true; +static inline constexpr bool NO_NULLS = false; + +/* + * Determines level of type strictness in combining array elements into R vectors. + * + * When arrays are not homogeneous and `Type_Policy::anything_goes` is used, type promotion follows + * R's behavior. + */ +enum class Type_Policy : int { + anything_goes = 0, /* Non-recursive arrays always become vectors */ + ints_as_dbls = 1, /* Combines `rcpp_T::i32`s, `::i64`s, and `::dbl`s */ + strict = 2, /* No type promotions */ +}; + + +/* + * Maximum simplification level. `Simplify_To::list` results in no simplification. + */ +enum class Simplify_To : int { + data_frame = 0, + matrix = 1, + vector = 2, + list = 3, +}; + + +} // namespace deserialize + + +} // namespace rcppsimdjson + + +#include +#include "RcppSimdJson/utils.hpp" + + +#endif \ No newline at end of file diff --git a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp index e92e05e..c692953 100644 --- a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp +++ b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp @@ -1,16 +1,18 @@ #ifndef RCPPSIMDJSON__DESERIALIZE__TYPE_DOCTOR_HPP #define RCPPSIMDJSON__DESERIALIZE__TYPE_DOCTOR_HPP +#include "../common.hpp" + namespace rcppsimdjson { namespace deserialize { -enum class Type_Policy : int { - anything_goes = 0, - ints_as_dbls = 1, - strict = 2, -}; +// enum class Type_Policy : int { +// anything_goes = 0, +// ints_as_dbls = 1, +// strict = 2, +// }; template class Type_Doctor { @@ -230,7 +232,6 @@ inline constexpr auto Type_Doctor::common_R_type() co if (dbl_ && !(lgl_ || u64_)) { // any number will become double return rcpp_T::dbl; } - if (i64_ && !(i32_ || lgl_ || u64_)) { if (i64_ && !(lgl_ || u64_)) { // only 64/32-bit integers: will follow selected Int64_R_Type option return rcpp_T::i64; diff --git a/inst/include/RcppSimdJson/deserialize/scalar.hpp b/inst/include/RcppSimdJson/deserialize/scalar.hpp index 258478b..78acfdf 100644 --- a/inst/include/RcppSimdJson/deserialize/scalar.hpp +++ b/inst/include/RcppSimdJson/deserialize/scalar.hpp @@ -63,7 +63,7 @@ inline auto get_scalar_(simdjson::dom::element element) no template <> inline auto get_scalar_(simdjson::dom::element element) noexcept( is_no_except(rcpp_T::dbl)) { - return static_cast(element.get().first); + return element.get().first; } // return int64_t template <> diff --git a/inst/include/RcppSimdJson/deserialize/simplify.hpp b/inst/include/RcppSimdJson/deserialize/simplify.hpp index 096ef61..8e5a504 100644 --- a/inst/include/RcppSimdJson/deserialize/simplify.hpp +++ b/inst/include/RcppSimdJson/deserialize/simplify.hpp @@ -2,35 +2,17 @@ #define RCPPSIMDJSON__DESERIALIZE__SIMPLIFY_HPP -namespace rcppsimdjson { -namespace deserialize { - - -static inline constexpr bool HAS_NULLS = true; -static inline constexpr bool NO_NULLS = false; - -} // namespace deserialize -} // namespace rcppsimdjson - - #include "Type_Doctor.hpp" #include "scalar.hpp" #include "vector.hpp" #include "matrix.hpp" #include "dataframe.hpp" + namespace rcppsimdjson { namespace deserialize { -enum class Simplify_To : int { - data_frame = 0, - matrix = 1, - vector = 2, - list = 3, -}; - - // forward declaration template inline auto simplify_element(const simdjson::dom::element, const SEXP, const SEXP) -> SEXP; From 891df4a8b7acaeed94e9b848bb7bfe86649b9ee7 Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Mon, 15 Jun 2020 08:55:37 -0700 Subject: [PATCH 03/16] move forward-declaration for simplify_element() to inst/include/RcppSimdJson/common.hpp so build_data_frame() can go where it should have been (inst/include/RcppSimdJson/deserialize/dataframe.hpp) --- inst/include/RcppSimdJson/common.hpp | 14 ++++ .../RcppSimdJson/deserialize/dataframe.hpp | 74 +++++++++++++++++ .../RcppSimdJson/deserialize/simplify.hpp | 81 +------------------ 3 files changed, 90 insertions(+), 79 deletions(-) diff --git a/inst/include/RcppSimdJson/common.hpp b/inst/include/RcppSimdJson/common.hpp index 44b88da..71a9d86 100644 --- a/inst/include/RcppSimdJson/common.hpp +++ b/inst/include/RcppSimdJson/common.hpp @@ -118,4 +118,18 @@ enum class Simplify_To : int { #include "RcppSimdJson/utils.hpp" +namespace rcppsimdjson { +namespace deserialize { + + +// forward declaration: definition in inst/include/RcppSimdJson/deserialize/simplify.hpp +template +inline auto simplify_element(simdjson::dom::element element, SEXP empty_array, SEXP empty_object) + -> SEXP; + + +} // namespace deserialize +} // namespace rcppsimdjson + + #endif \ No newline at end of file diff --git a/inst/include/RcppSimdJson/deserialize/dataframe.hpp b/inst/include/RcppSimdJson/deserialize/dataframe.hpp index 0a8a0e2..9cd01e1 100644 --- a/inst/include/RcppSimdJson/deserialize/dataframe.hpp +++ b/inst/include/RcppSimdJson/deserialize/dataframe.hpp @@ -159,6 +159,80 @@ inline auto build_col_integer64(const simdjson::dom::array array, } + +template +inline auto build_data_frame(const simdjson::dom::array array, + const std::map>& cols, + SEXP empty_array, + SEXP empty_object) -> SEXP { + + const auto n_rows = R_xlen_t(std::size(array)); + auto out = Rcpp::List(std::size(cols)); + auto out_names = Rcpp::CharacterVector(std::size(cols)); + + for (auto [key, col] : cols) { + out_names[col.index] = std::string(key); + + switch (col.schema.common_R_type()) { + case rcpp_T::chr: { + out[col.index] = + build_col(array, key, col.schema); + break; + } + + case rcpp_T::dbl: { + out[col.index] = + build_col(array, key, col.schema); + break; + } + + case rcpp_T::i64: { + out[col.index] = build_col_integer64(array, key, col.schema); + break; + } + + case rcpp_T::i32: { + out[col.index] = + build_col(array, key, col.schema); + break; + } + + case rcpp_T::lgl: { + out[col.index] = build_col(array, key, col.schema); + break; + } + + case rcpp_T::null: { + out[col.index] = Rcpp::LogicalVector(n_rows, NA_LOGICAL); + break; + } + + default: { + auto this_col = Rcpp::Vector(n_rows); + auto i_row = R_xlen_t(0); + for (auto element : array) { + auto [value, error] = element.get().at_key(key); + if (error) { + this_col[i_row++] = NA_LOGICAL; + } else { + this_col[i_row++] = simplify_element( + value, empty_array, empty_object // + ); + } + } + out[col.index] = this_col; + } + } + } + + out.attr("names") = out_names; + out.attr("row.names") = Rcpp::seq(1, n_rows); + out.attr("class") = "data.frame"; + + return out; +} + + } // namespace deserialize } // namespace rcppsimdjson diff --git a/inst/include/RcppSimdJson/deserialize/simplify.hpp b/inst/include/RcppSimdJson/deserialize/simplify.hpp index 8e5a504..2614895 100644 --- a/inst/include/RcppSimdJson/deserialize/simplify.hpp +++ b/inst/include/RcppSimdJson/deserialize/simplify.hpp @@ -1,7 +1,7 @@ #ifndef RCPPSIMDJSON__DESERIALIZE__SIMPLIFY_HPP #define RCPPSIMDJSON__DESERIALIZE__SIMPLIFY_HPP - +#include "../common.hpp" #include "Type_Doctor.hpp" #include "scalar.hpp" #include "vector.hpp" @@ -13,84 +13,6 @@ namespace rcppsimdjson { namespace deserialize { -// forward declaration -template -inline auto simplify_element(const simdjson::dom::element, const SEXP, const SEXP) -> SEXP; - - -template -inline auto build_data_frame(const simdjson::dom::array array, - const std::map>& cols, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { - - const auto n_rows = R_xlen_t(std::size(array)); - auto out = Rcpp::List(std::size(cols)); - auto out_names = Rcpp::CharacterVector(std::size(cols)); - - for (auto [key, col] : cols) { - out_names[col.index] = std::string(key); - - switch (col.schema.common_R_type()) { - case rcpp_T::chr: { - out[col.index] = - build_col(array, key, col.schema); - break; - } - - case rcpp_T::dbl: { - out[col.index] = - build_col(array, key, col.schema); - break; - } - - case rcpp_T::i64: { - out[col.index] = build_col_integer64(array, key, col.schema); - break; - } - - case rcpp_T::i32: { - out[col.index] = - build_col(array, key, col.schema); - break; - } - - case rcpp_T::lgl: { - out[col.index] = build_col(array, key, col.schema); - break; - } - - case rcpp_T::null: { - out[col.index] = Rcpp::LogicalVector(n_rows, NA_LOGICAL); - break; - } - - default: { - auto this_col = Rcpp::Vector(n_rows); - auto i_row = R_xlen_t(0); - for (auto element : array) { - auto [value, error] = element.get().at_key(key); - if (error) { - this_col[i_row++] = NA_LOGICAL; - } else { - this_col[i_row++] = simplify_element( - value, empty_array, empty_object // - ); - } - } - out[col.index] = this_col; - } - } - } - - out.attr("names") = out_names; - out.attr("row.names") = Rcpp::seq(1, n_rows); - out.attr("class") = "data.frame"; - - return out; -} - - template inline auto simplify_list(const simdjson::dom::array array, const SEXP empty_array, @@ -234,6 +156,7 @@ inline auto simplify_object(const simdjson::dom::object object, } +// definition: forward declaration in inst/include/RcppSimdJson/deserialize/simplify.hpp template inline auto simplify_element(const simdjson::dom::element element, const SEXP empty_array, From 89d5b34d049ba8f5c28c7cd23fb5bf6de31b689a Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Mon, 15 Jun 2020 15:38:32 -0700 Subject: [PATCH 04/16] make all integer types explicit --- .../RcppSimdJson/deserialize/dataframe.hpp | 11 +++++----- .../RcppSimdJson/deserialize/matrix.hpp | 22 ++++++++++--------- .../RcppSimdJson/deserialize/simplify.hpp | 6 ++--- .../RcppSimdJson/deserialize/vector.hpp | 12 +++++----- 4 files changed, 26 insertions(+), 25 deletions(-) diff --git a/inst/include/RcppSimdJson/deserialize/dataframe.hpp b/inst/include/RcppSimdJson/deserialize/dataframe.hpp index 9cd01e1..9f897e5 100644 --- a/inst/include/RcppSimdJson/deserialize/dataframe.hpp +++ b/inst/include/RcppSimdJson/deserialize/dataframe.hpp @@ -8,7 +8,7 @@ namespace rcppsimdjson { namespace deserialize { template struct Column { - R_xlen_t index = 0; + R_xlen_t index = 0L; Type_Doctor schema = Type_Doctor(); }; @@ -24,7 +24,7 @@ diagnose_data_frame(const simdjson::dom::array array) noexcept(RCPPSIMDJSON_NO_E -> std::optional> { auto cols = Column_Schema(); - auto col_index = 0; + auto col_index = R_xlen_t(0L); if (std::size(array) == 0) { return std::nullopt; @@ -59,7 +59,7 @@ inline auto build_col(const simdjson::dom::array array, const Type_Doctor& type_doc) -> Rcpp::Vector { auto out = Rcpp::Vector(std::size(array), na_val()); - auto i_row = R_xlen_t(0); + auto i_row = R_xlen_t(0L); if (type_doc.is_homogeneous()) { if (type_doc.has_null()) { @@ -112,7 +112,7 @@ inline auto build_col_integer64(const simdjson::dom::array array, if constexpr (int64_opt == utils::Int64_R_Type::Integer64) { auto stl_vec = std::vector(std::size(array), NA_INTEGER64); - auto i_row = std::size_t(0); + auto i_row = std::size_t(0ULL); if (type_doc.is_homogeneous()) { if (type_doc.has_null()) { @@ -159,7 +159,6 @@ inline auto build_col_integer64(const simdjson::dom::array array, } - template inline auto build_data_frame(const simdjson::dom::array array, const std::map>& cols, @@ -209,7 +208,7 @@ inline auto build_data_frame(const simdjson::dom::array array, default: { auto this_col = Rcpp::Vector(n_rows); - auto i_row = R_xlen_t(0); + auto i_row = R_xlen_t(0L); for (auto element : array) { auto [value, error] = element.get().at_key(key); if (error) { diff --git a/inst/include/RcppSimdJson/deserialize/matrix.hpp b/inst/include/RcppSimdJson/deserialize/matrix.hpp index 074ca34..a08995c 100644 --- a/inst/include/RcppSimdJson/deserialize/matrix.hpp +++ b/inst/include/RcppSimdJson/deserialize/matrix.hpp @@ -47,11 +47,11 @@ inline auto diagnose(simdjson::dom::array array) noexcept(RCPPSIMDJSON_NO_EXCEPT template -inline auto build_matrix_typed(simdjson::dom::array array, const std::size_t n_cols) +inline auto build_matrix_typed(simdjson::dom::array array, std::size_t n_cols) -> Rcpp::Vector { - const auto n_rows = std::size(array); - auto out = Rcpp::Matrix(n_rows, n_cols); + const auto n_rows = r_length(array); + auto out = Rcpp::Matrix(n_rows, static_cast(n_cols)); auto j = R_xlen_t(0); #if RCPPSIMDJSON_EXCEPTIONS @@ -106,7 +106,8 @@ inline auto build_matrix_integer64_typed(simdjson::dom::array array, const std:: #endif auto out = Rcpp::NumericVector(utils::as_integer64(stl_vec_int64)); - out.attr("dim") = Rcpp::IntegerVector::create(n_rows, n_cols); + out.attr("dim") = Rcpp::IntegerVector::create(static_cast(n_rows), // + static_cast(n_cols)); return out; } @@ -159,7 +160,7 @@ inline auto dispatch_typed(const simdjson::dom::array array, : build_matrix_typed(array, n_cols); case simdjson::dom::element_type::NULL_VALUE: - return Rcpp::LogicalVector(std::size(array), NA_LOGICAL); + return Rcpp::LogicalVector(r_length(array), NA_LOGICAL); case simdjson::dom::element_type::UINT64: return has_nulls ? build_matrix_typed(array, n_cols) @@ -173,8 +174,8 @@ inline auto dispatch_typed(const simdjson::dom::array array, template inline auto build_matrix_mixed(const simdjson::dom::array array, const std::size_t n_cols) -> SEXP { - const auto n_rows = std::size(array); - Rcpp::Matrix out(n_rows, n_cols); + const auto n_rows = r_length(array); + Rcpp::Matrix out(n_rows, static_cast(n_cols)); auto j = R_xlen_t(0); @@ -207,7 +208,7 @@ inline auto build_matrix_integer64_mixed(const simdjson::dom::array array, std:: const auto n_rows = std::size(array); auto stl_vec_int64 = std::vector(n_rows * n_cols); - auto j = std::size_t(0); + auto j = std::size_t(0ULL); #if RCPPSIMDJSON_EXCEPTIONS for (simdjson::dom::array sub_array : array) { @@ -253,7 +254,8 @@ inline auto build_matrix_integer64_mixed(const simdjson::dom::array array, std:: auto out = Rcpp::Vector(utils::as_integer64(stl_vec_int64)); - out.attr("dim") = Rcpp::IntegerVector::create(n_rows, n_cols); + out.attr("dim") = Rcpp::IntegerVector::create(static_cast(n_rows), // + static_cast(n_cols)); return out; } @@ -294,7 +296,7 @@ inline auto dispatch_mixed(const simdjson::dom::array array, return build_matrix_mixed(array, n_cols); default: { - auto out = Rcpp::LogicalMatrix(std::size(array), n_cols); + auto out = Rcpp::LogicalMatrix(r_length(array), static_cast(n_cols)); out.fill(NA_LOGICAL); return out; } diff --git a/inst/include/RcppSimdJson/deserialize/simplify.hpp b/inst/include/RcppSimdJson/deserialize/simplify.hpp index 2614895..ebcc4b2 100644 --- a/inst/include/RcppSimdJson/deserialize/simplify.hpp +++ b/inst/include/RcppSimdJson/deserialize/simplify.hpp @@ -17,7 +17,7 @@ template SEXP { - Rcpp::List out(std::size(array)); + Rcpp::List out(r_length(array)); auto i = R_xlen_t(0); for (auto element : array) { @@ -136,7 +136,7 @@ template SEXP { - const auto n = R_xlen_t(std::size(object)); + const auto n = r_length(object); if (n == 0) { return empty_object; } @@ -144,7 +144,7 @@ inline auto simplify_object(const simdjson::dom::object object, Rcpp::List out(n); Rcpp::CharacterVector out_names(n); - auto i = R_xlen_t(0); + auto i = R_xlen_t(0L); for (auto [key, value] : object) { out[i] = simplify_element(value, empty_array, empty_object); diff --git a/inst/include/RcppSimdJson/deserialize/vector.hpp b/inst/include/RcppSimdJson/deserialize/vector.hpp index e747af9..7771044 100644 --- a/inst/include/RcppSimdJson/deserialize/vector.hpp +++ b/inst/include/RcppSimdJson/deserialize/vector.hpp @@ -9,7 +9,7 @@ namespace vector { template inline auto build_vector_typed(const simdjson::dom::array array) -> Rcpp::Vector { - auto out = Rcpp::Vector(std::size(array)); + auto out = Rcpp::Vector(r_length(array)); auto i = R_xlen_t(0); for (auto element : array) { out[i++] = get_scalar(element); @@ -23,7 +23,7 @@ inline auto build_vector_integer64_typed(const simdjson::dom::array array) -> Rcpp::Vector { auto stl_vec_int64 = std::vector(std::size(array)); - auto i = std::size_t(0); + auto i = std::size_t(0ULL); for (auto element : array) { stl_vec_int64[i++] = get_scalar(element); } @@ -82,8 +82,8 @@ inline auto dispatch_typed(const simdjson::dom::array array, template inline auto build_vector_mixed(const simdjson::dom::array array) -> Rcpp::Vector { - auto out = Rcpp::Vector(std::size(array)); - auto i = R_xlen_t(0); + auto out = Rcpp::Vector(r_length(array)); + auto i = R_xlen_t(0L); for (auto element : array) { out[i++] = get_scalar_dispatch(element); } @@ -96,7 +96,7 @@ inline auto build_vector_integer64_mixed(const simdjson::dom::array array) -> Rcpp::Vector { auto stl_vec_int64 = std::vector(std::size(array)); - auto i = std::size_t(0); + auto i = std::size_t(0ULL); for (auto element : array) { switch (element.type()) { case simdjson::dom::element_type::INT64: @@ -151,7 +151,7 @@ inline auto dispatch_mixed(const simdjson::dom::array array, const rcpp_T common return build_vector_mixed(array); default: - return Rcpp::LogicalVector(std::size(array), NA_LOGICAL); + return Rcpp::LogicalVector(r_length(array), NA_LOGICAL); } } From 7b518d05b690e1cc38d2909179eafc006d56ce9e Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Mon, 15 Jun 2020 15:40:49 -0700 Subject: [PATCH 05/16] remove template specification for Rcpp::wrap() --- inst/include/RcppSimdJson/deserialize/simplify.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/include/RcppSimdJson/deserialize/simplify.hpp b/inst/include/RcppSimdJson/deserialize/simplify.hpp index ebcc4b2..8dcca0c 100644 --- a/inst/include/RcppSimdJson/deserialize/simplify.hpp +++ b/inst/include/RcppSimdJson/deserialize/simplify.hpp @@ -178,13 +178,13 @@ inline auto simplify_element(const simdjson::dom::element element, ); case simdjson::dom::element_type::DOUBLE: - return Rcpp::wrap(element.get().first); + return Rcpp::wrap(element.get().first); case simdjson::dom::element_type::INT64: return utils::resolve_int64(element.get().first); case simdjson::dom::element_type::BOOL: - return Rcpp::wrap(element.get().first); + return Rcpp::wrap(element.get().first); case simdjson::dom::element_type::STRING: return Rcpp::wrap(element.get().first); From dbf9ca4bd1a5d635a3a0293e86c691d7df68a5c5 Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Mon, 15 Jun 2020 15:43:31 -0700 Subject: [PATCH 06/16] remove const qualifiers on empty_array/object --- .../RcppSimdJson/deserialize/simplify.hpp | 33 ++++++++----------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/inst/include/RcppSimdJson/deserialize/simplify.hpp b/inst/include/RcppSimdJson/deserialize/simplify.hpp index 8dcca0c..4458e01 100644 --- a/inst/include/RcppSimdJson/deserialize/simplify.hpp +++ b/inst/include/RcppSimdJson/deserialize/simplify.hpp @@ -14,9 +14,8 @@ namespace deserialize { template -inline auto simplify_list(const simdjson::dom::array array, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { +inline auto simplify_list(const simdjson::dom::array array, SEXP empty_array, SEXP empty_object) + -> SEXP { Rcpp::List out(r_length(array)); auto i = R_xlen_t(0); @@ -33,9 +32,8 @@ inline auto simplify_list(const simdjson::dom::array array, template -inline auto simplify_vector(const simdjson::dom::array array, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { +inline auto simplify_vector(const simdjson::dom::array array, SEXP empty_array, SEXP empty_object) + -> SEXP { const auto type_doctor = Type_Doctor(array); if (type_doctor.is_vectorizable()) { @@ -53,9 +51,8 @@ inline auto simplify_vector(const simdjson::dom::array array, template -inline auto simplify_matrix(const simdjson::dom::array array, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { +inline auto simplify_matrix(const simdjson::dom::array array, SEXP empty_array, SEXP empty_object) + -> SEXP { if (const auto matrix = matrix::diagnose(array)) { return matrix->is_homogeneous ? matrix::dispatch_typed( // @@ -73,9 +70,8 @@ inline auto simplify_matrix(const simdjson::dom::array array, template -inline auto simplify_data_frame(const simdjson::dom::array array, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { +inline auto +simplify_data_frame(const simdjson::dom::array array, SEXP empty_array, SEXP empty_object) -> SEXP { if (const auto cols = diagnose_data_frame(array)) { return build_data_frame( // array, // @@ -91,8 +87,8 @@ inline auto simplify_data_frame(const simdjson::dom::array array, template inline auto dispatch_simplify_array(const simdjson::dom::array array, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { + SEXP empty_array, + SEXP empty_object) -> SEXP { if (std::size(array) == 0) { return empty_array; @@ -133,9 +129,8 @@ inline auto dispatch_simplify_array(const simdjson::dom::array array, template -inline auto simplify_object(const simdjson::dom::object object, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { +inline auto simplify_object(const simdjson::dom::object object, SEXP empty_array, SEXP empty_object) + -> SEXP { const auto n = r_length(object); if (n == 0) { return empty_object; @@ -159,8 +154,8 @@ inline auto simplify_object(const simdjson::dom::object object, // definition: forward declaration in inst/include/RcppSimdJson/deserialize/simplify.hpp template inline auto simplify_element(const simdjson::dom::element element, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { + SEXP empty_array, + SEXP empty_object) -> SEXP { switch (element.type()) { case simdjson::dom::element_type::ARRAY: From 918e4f9a78c413e9b00079eb1312bdc3d292fc1d Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Mon, 15 Jun 2020 15:46:06 -0700 Subject: [PATCH 07/16] clean dead comments, fix formatting --- inst/include/RcppSimdJson/deserialize.hpp | 3 + .../RcppSimdJson/deserialize/Type_Doctor.hpp | 12 +--- .../RcppSimdJson/deserialize/dataframe.hpp | 1 + .../RcppSimdJson/deserialize/matrix.hpp | 62 +------------------ .../RcppSimdJson/deserialize/simplify.hpp | 1 + 5 files changed, 11 insertions(+), 68 deletions(-) diff --git a/inst/include/RcppSimdJson/deserialize.hpp b/inst/include/RcppSimdJson/deserialize.hpp index edcd970..3e7b204 100644 --- a/inst/include/RcppSimdJson/deserialize.hpp +++ b/inst/include/RcppSimdJson/deserialize.hpp @@ -1,8 +1,10 @@ #ifndef RCPPSIMDJSON__DESERIALIZE_HPP #define RCPPSIMDJSON__DESERIALIZE_HPP + #include "deserialize/simplify.hpp" + namespace rcppsimdjson { namespace deserialize { @@ -248,4 +250,5 @@ inline auto deserialize(const simdjson::dom::element parsed, } // namespace deserialize } // namespace rcppsimdjson + #endif diff --git a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp index c692953..f3909d5 100644 --- a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp +++ b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp @@ -8,13 +8,6 @@ namespace rcppsimdjson { namespace deserialize { -// enum class Type_Policy : int { -// anything_goes = 0, -// ints_as_dbls = 1, -// strict = 2, -// }; - - template class Type_Doctor { bool ARRAY_ = false; bool array_ = false; @@ -41,6 +34,7 @@ template class Type_Doctor { bool UINT64_ = false; bool u64_ = false; + public: Type_Doctor() = default; explicit Type_Doctor(simdjson::dom::array) noexcept; @@ -99,7 +93,7 @@ template class Type_Doctor { auto add_element(simdjson::dom::element) noexcept -> void; - constexpr auto update(Type_Doctor&& type_doctor) noexcept -> void; + constexpr auto update(Type_Doctor&&) noexcept -> void; }; @@ -327,7 +321,7 @@ template inline constexpr auto Type_Doctor::common_element_type() const noexcept -> simdjson::dom::element_type { - using namespace simdjson::dom; + using simdjson::dom::element_type; return ARRAY_ ? element_type::ARRAY : OBJECT_ ? element_type::OBJECT diff --git a/inst/include/RcppSimdJson/deserialize/dataframe.hpp b/inst/include/RcppSimdJson/deserialize/dataframe.hpp index 9f897e5..0efa7c4 100644 --- a/inst/include/RcppSimdJson/deserialize/dataframe.hpp +++ b/inst/include/RcppSimdJson/deserialize/dataframe.hpp @@ -1,6 +1,7 @@ #ifndef RCPPSIMDJSON__DESERIALIZE__DATAFRAME_HPP #define RCPPSIMDJSON__DESERIALIZE__DATAFRAME_HPP + #include "matrix.hpp" diff --git a/inst/include/RcppSimdJson/deserialize/matrix.hpp b/inst/include/RcppSimdJson/deserialize/matrix.hpp index a08995c..edd173f 100644 --- a/inst/include/RcppSimdJson/deserialize/matrix.hpp +++ b/inst/include/RcppSimdJson/deserialize/matrix.hpp @@ -54,16 +54,6 @@ inline auto build_matrix_typed(simdjson::dom::array array, std::size_t n_cols) auto out = Rcpp::Matrix(n_rows, static_cast(n_cols)); auto j = R_xlen_t(0); -#if RCPPSIMDJSON_EXCEPTIONS - for (simdjson::dom::array sub_array : array) { - auto i = R_xlen_t(0); - for (auto element : sub_array) { - out[i + j] = get_scalar(element); - i += n_rows; - } - j++; - } -#else for (auto sub_array : array) { auto i = R_xlen_t(0); for (auto element : sub_array.get().first) { @@ -72,29 +62,18 @@ inline auto build_matrix_typed(simdjson::dom::array array, std::size_t n_cols) } j++; } -#endif return out; } template -inline auto build_matrix_integer64_typed(simdjson::dom::array array, const std::size_t n_cols) +inline auto build_matrix_integer64_typed(simdjson::dom::array array, std::size_t n_cols) -> Rcpp::Vector { const auto n_rows = std::size(array); auto stl_vec_int64 = std::vector(n_rows * n_cols); auto j = std::size_t(0); -#if RCPPSIMDJSON_EXCEPTIONS - for (simdjson::dom::array sub_array : array) { - auto i = std::size_t(0); - for (auto element : sub_array) { - stl_vec_int64[i + j] = get_scalar(element); - i += n_rows; - } - j++; - } -#else for (auto sub_array : array) { auto i = std::size_t(0); for (auto element : sub_array.get().first) { @@ -103,7 +82,6 @@ inline auto build_matrix_integer64_typed(simdjson::dom::array array, const std:: } j++; } -#endif auto out = Rcpp::NumericVector(utils::as_integer64(stl_vec_int64)); out.attr("dim") = Rcpp::IntegerVector::create(static_cast(n_rows), // @@ -169,26 +147,16 @@ inline auto dispatch_typed(const simdjson::dom::array array, default: return R_NilValue; } -} // namespace deserialize +} template -inline auto build_matrix_mixed(const simdjson::dom::array array, const std::size_t n_cols) -> SEXP { +inline auto build_matrix_mixed(simdjson::dom::array array, std::size_t n_cols) -> SEXP { const auto n_rows = r_length(array); Rcpp::Matrix out(n_rows, static_cast(n_cols)); auto j = R_xlen_t(0); -#if RCPPSIMDJSON_EXCEPTIONS - for (simdjson::dom::array sub_array : array) { - auto i = R_xlen_t(0); - for (auto element : sub_array) { - out[i + j] = get_scalar_dispatch(element); - i += n_rows; - } - j++; - } -#else for (auto sub_array : array) { auto i = R_xlen_t(0); for (auto element : sub_array.get().first) { @@ -197,7 +165,6 @@ inline auto build_matrix_mixed(const simdjson::dom::array array, const std::size } j++; } -#endif return out; } @@ -210,27 +177,6 @@ inline auto build_matrix_integer64_mixed(const simdjson::dom::array array, std:: auto stl_vec_int64 = std::vector(n_rows * n_cols); auto j = std::size_t(0ULL); -#if RCPPSIMDJSON_EXCEPTIONS - for (simdjson::dom::array sub_array : array) { - std::size_t i = 0; - for (auto element : sub_array) { - switch (element.type()) { - case simdjson::dom::element_type::INT64: - stl_vec_int64[i + j] = get_scalar(element); - break; - - case simdjson::dom::element_type::BOOL: - stl_vec_int64[i + j] = get_scalar(element); - break; - - default: - stl_vec_int64[i + j] = NA_INTEGER64; - } - i += n_rows; - } - j++; - } -#else for (auto element : array) { std::size_t i = 0; for (auto sub_element : element.get().first) { @@ -250,8 +196,6 @@ inline auto build_matrix_integer64_mixed(const simdjson::dom::array array, std:: } j++; } -#endif - auto out = Rcpp::Vector(utils::as_integer64(stl_vec_int64)); out.attr("dim") = Rcpp::IntegerVector::create(static_cast(n_rows), // diff --git a/inst/include/RcppSimdJson/deserialize/simplify.hpp b/inst/include/RcppSimdJson/deserialize/simplify.hpp index 4458e01..e69eb23 100644 --- a/inst/include/RcppSimdJson/deserialize/simplify.hpp +++ b/inst/include/RcppSimdJson/deserialize/simplify.hpp @@ -1,6 +1,7 @@ #ifndef RCPPSIMDJSON__DESERIALIZE__SIMPLIFY_HPP #define RCPPSIMDJSON__DESERIALIZE__SIMPLIFY_HPP + #include "../common.hpp" #include "Type_Doctor.hpp" #include "scalar.hpp" From 346d5840aeb06cc2e655984f3c0a1867520300a6 Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Mon, 15 Jun 2020 15:46:36 -0700 Subject: [PATCH 08/16] add more documentation --- inst/include/RcppSimdJson/common.hpp | 98 +++++++++++-------- inst/include/RcppSimdJson/deserialize.hpp | 27 ++++- .../RcppSimdJson/deserialize/scalar.hpp | 10 ++ .../RcppSimdJson/deserialize/simplify.hpp | 25 ++++- 4 files changed, 115 insertions(+), 45 deletions(-) diff --git a/inst/include/RcppSimdJson/common.hpp b/inst/include/RcppSimdJson/common.hpp index 71a9d86..a882d7b 100644 --- a/inst/include/RcppSimdJson/common.hpp +++ b/inst/include/RcppSimdJson/common.hpp @@ -8,30 +8,39 @@ namespace rcppsimdjson { -/* - * `bit64::integer64`-compatible `NA` +/** + * @brief A container's size as an @c R_xlen_t @c. Otherwise Equivalent to @c std::size() @c. + */ +template +inline constexpr auto r_length(const _Container& __cont) noexcept -> R_xlen_t { + return static_cast(std::size(__cont)); +} + + +/** + * @brief A @c bit64::integer64 @c-compatible @c NA @c. */ static inline constexpr int64_t NA_INTEGER64 = LLONG_MIN; -/* - * Typing arguments that decide how `simdjson::dom::element`s are ultimate return to R. +/** + * @brief Typing arguments that decide how a @c simdjson::dom::element is ultimately returned to R. */ enum class rcpp_T : int { - array = 0, /* recursive: individual elements will decide ultimate R type */ - object = 1, /* recursive: individual elements will decide ultimate R type */ - chr = 2, /* always becomes `Rcpp::String`/`character(1L)` */ - u64 = 3, /* always becomes `Rcpp::String`/`character(1L)` */ - dbl = 4, /* always becomes `double` */ - i64 = 5, /* follows Int64_R_Type: `double`, `character(1L)`, or `bit64::integer64` */ - i32 = 6, /* always becomes `int` */ - lgl = 7, /* always becomes `bool */ - null = 8, /* becomes `NA` if returned in a vector, else `NULL */ + array = 0, /**< recursive: individual elements will decide ultimate R type */ + object = 1, /**< recursive: individual elements will decide ultimate R type */ + chr = 2, /**< always becomes @c Rcpp::String / @c character */ + u64 = 3, /**< always becomes @c Rcpp::String / @c character */ + dbl = 4, /**< always becomes @c double */ + i64 = 5, /**< follows @c Int64_R_Type: @c double, @c character, or @c bit64::integer64 */ + i32 = 6, /**< always becomes @c int */ + lgl = 7, /**< always becomes @c bool / @c logical */ + null = 8, /**< becomes @c NA if returned in a vector, else @c NULL */ }; -/* - * Generic, typed `NA` inserter. +/** + * @brief Get a typed @c NA @c. */ template static inline constexpr auto na_val() { if constexpr (R_Type == rcpp_T::chr) { @@ -52,28 +61,34 @@ template static inline constexpr auto na_val() { } -/* +/** * Internal flags tracking whether simdjson is compiled with exceptions enabled (the default). - * If simdjson is compiled w/o exceptions (`#define SIMDJSON_EXCEPTIONS 0`), operations that do not - * touch throwing code can be annotated with keyword `noexcept` where appropriate. - * See inst/include/RcppSimdJson/deserialize/scalar.hpp for examples. + * If simdjson is compiled w/o exceptions ( @c #define SIMDJSON_EXCEPTIONS 0 @c), operations that + * do not touch throwing code can be annotated with keyword @c noexcept where appropriate. */ // #define SIMDJSON_EXCEPTIONS 0 /* uncomment to disable compiling simdjson w/ exceptions */ #ifdef SIMDJSON_EXCEPTIONS #define RCPPSIMDJSON_EXCEPTIONS SIMDJSON_EXCEPTIONS static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = SIMDJSON_EXCEPTIONS != 1; #else -#define RCPPSIMDJSON_EXCEPTIONS 1 // NOLINT(cppcoreguidelines-macro-usage) +#define RCPPSIMDJSON_EXCEPTIONS 1 static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = false; #endif -/* - * All scalar-getter functions are annotated with `is_no_except()`, which will be false if - * `RCPPSIMDJSON_NO_EXCEPTIONS` is enabled and their `rcpp_T` template argument is not - * `rcpp_T::chr` (strings are not currently extractable w/o touching throwing code). +/** + * @brief Whether a function is @code{noexcept}. + * + * If a function does not touch throwing code it can be annotated as @c noexcept(). + * If @c RCPPSIMDJSON_NO_EXCEPTIONS is enabled and the @c rcpp_T template argument is not + * @c rcpp_T::chr, functions annotated with @c noexcept(is_no_except(rcpp_T)) will be @c noexcept + * when compiled. Currently, @c rccp_T::chr touches throwing code so functions using it will always + * be @c noexcept(false). + * + * Many examples in @file{inst/include/RcppSimdJson/deserialize/scalar.hpp}. */ -static inline constexpr auto is_no_except(rcpp_T R_Type) noexcept -> bool { +static inline constexpr auto is_no_except(rcpp_T R_Type) // NOLINT(clang-diagnostic-unused-function) + -> bool { return RCPPSIMDJSON_NO_EXCEPTIONS && R_Type != rcpp_T::chr; } @@ -81,30 +96,27 @@ static inline constexpr auto is_no_except(rcpp_T R_Type) noexcept -> bool { namespace deserialize { -static inline constexpr bool HAS_NULLS = true; -static inline constexpr bool NO_NULLS = false; - -/* - * Determines level of type strictness in combining array elements into R vectors. +/** + * @brief Determines level of type strictness in combining array elements into R vectors. * - * When arrays are not homogeneous and `Type_Policy::anything_goes` is used, type promotion follows + * When arrays are not homogeneous and @c Type_Policy::anything_goes is used, type promotion follows * R's behavior. */ enum class Type_Policy : int { - anything_goes = 0, /* Non-recursive arrays always become vectors */ - ints_as_dbls = 1, /* Combines `rcpp_T::i32`s, `::i64`s, and `::dbl`s */ - strict = 2, /* No type promotions */ + anything_goes = 0, /* Non-recursive arrays always become vectors of the highest present type */ + ints_as_dbls = 1, /* Non-recursive arrays of only numbers are promoted to highest type */ + strict = 2, /* No type promotion */ }; -/* - * Maximum simplification level. `Simplify_To::list` results in no simplification. +/** + * @brief Maximum simplification level. */ enum class Simplify_To : int { - data_frame = 0, - matrix = 1, - vector = 2, - list = 3, + data_frame = 0, /* If possible, return dataframes. Otherwise return matrices/vectors/lists. */ + matrix = 1, /* If possible, return matrices. Otherwise return vectors/lists. */ + vector = 2, /* If possible, return vectors. Otherwise return lists. */ + list = 3, /* No simplification. */ }; @@ -122,7 +134,11 @@ namespace rcppsimdjson { namespace deserialize { -// forward declaration: definition in inst/include/RcppSimdJson/deserialize/simplify.hpp +/** + * @brief Simplify a @c simdjson::dom::element to an R object. + * + * @note Forward declaration. See @file inst/include/RcppSimdJson/deserialize/simplify.hpp @file. + */ template inline auto simplify_element(simdjson::dom::element element, SEXP empty_array, SEXP empty_object) -> SEXP; diff --git a/inst/include/RcppSimdJson/deserialize.hpp b/inst/include/RcppSimdJson/deserialize.hpp index 3e7b204..c990bf9 100644 --- a/inst/include/RcppSimdJson/deserialize.hpp +++ b/inst/include/RcppSimdJson/deserialize.hpp @@ -8,15 +8,36 @@ namespace rcppsimdjson { namespace deserialize { -// THE GREAT DISPATCHER + +/** + * @brief Deserialize a parsed @c simdjson::dom::element to R objects. + * + * + * @param element @c simdjson::dom::element to deserialize. + * + * @param empty_array R object to return when encountering an empty JSON array. + * + * @param empty_object R object to return when encountering an empty JSON object. + * + * @param type_policy @c Type_Policy specifying type strictness in combining mixed-type array + * elements into R vectors. + * + * @param int64_opt @c Int64_R_Type specifying how big integers are returned to R. + * + * @param simplify_to @c Simplify_To specifying the maximum level of simplification. + * + * + * @return The simplified R object ( @c SEXP ). + */ inline auto deserialize(const simdjson::dom::element parsed, - const SEXP empty_array, - const SEXP empty_object, + SEXP empty_array, + SEXP empty_object, const Simplify_To simplify_to, const Type_Policy type_policy, const utils::Int64_R_Type int64_opt) -> SEXP { using Int64_R_Type = utils::Int64_R_Type; + // THE GREAT DISPATCHER switch (type_policy) { case Type_Policy::anything_goes: { switch (int64_opt) { diff --git a/inst/include/RcppSimdJson/deserialize/scalar.hpp b/inst/include/RcppSimdJson/deserialize/scalar.hpp index 78acfdf..d76c04e 100644 --- a/inst/include/RcppSimdJson/deserialize/scalar.hpp +++ b/inst/include/RcppSimdJson/deserialize/scalar.hpp @@ -7,6 +7,16 @@ namespace rcppsimdjson { namespace deserialize { +/* + * Check for `null`s and return the appropriate `NA`s when found. + */ +static inline constexpr bool HAS_NULLS = true; +/* + * No `null`s present, so skip checking for them. + */ +static inline constexpr bool NO_NULLS = false; + + template inline auto get_scalar_(simdjson::dom::element) noexcept(is_no_except(R_Type)); diff --git a/inst/include/RcppSimdJson/deserialize/simplify.hpp b/inst/include/RcppSimdJson/deserialize/simplify.hpp index e69eb23..3107c64 100644 --- a/inst/include/RcppSimdJson/deserialize/simplify.hpp +++ b/inst/include/RcppSimdJson/deserialize/simplify.hpp @@ -152,7 +152,30 @@ inline auto simplify_object(const simdjson::dom::object object, SEXP empty_array } -// definition: forward declaration in inst/include/RcppSimdJson/deserialize/simplify.hpp +/** + * @brief Simplify a @c simdjson::dom::element to an R object. + * + * + * @tparam type_policy The @c Type_Policy specifying type strictness in combining mixed-type array + * elements into R vectors. + * + * @tparam int64_opt The @c Int64_R_Type specifying how big integers are returned to R. + * + * @tparam simplify_to The @c Simplify_To specifying the maximum level of simplification. + * + * + * @param element @c simdjson::dom::element to simplify. + * + * @param empty_array R object to return when encountering an empty JSON array. + * + * @param empty_object R object to return when encountering an empty JSON object. + * + * + * @return The simplified R object ( @c SEXP ). + * + * + * @note definition: forward declaration in @file inst/include/RcppSimdJson/common.hpp @file. + */ template inline auto simplify_element(const simdjson::dom::element element, SEXP empty_array, From 52a0f72584f3a69ba6f296f4031236046190e2b9 Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Mon, 15 Jun 2020 18:43:06 -0700 Subject: [PATCH 09/16] small documentation fixes --- inst/include/RcppSimdJson/common.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/inst/include/RcppSimdJson/common.hpp b/inst/include/RcppSimdJson/common.hpp index a882d7b..1261d36 100644 --- a/inst/include/RcppSimdJson/common.hpp +++ b/inst/include/RcppSimdJson/common.hpp @@ -77,13 +77,15 @@ static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = false; /** - * @brief Whether a function is @code{noexcept}. + * @brief Whether a function is @c noexcept. * - * If a function does not touch throwing code it can be annotated as @c noexcept(). + * If a function does not touch throwing code it can be annotated with @c noexcept(). * If @c RCPPSIMDJSON_NO_EXCEPTIONS is enabled and the @c rcpp_T template argument is not * @c rcpp_T::chr, functions annotated with @c noexcept(is_no_except(rcpp_T)) will be @c noexcept - * when compiled. Currently, @c rccp_T::chr touches throwing code so functions using it will always - * be @c noexcept(false). + * when compiled. + * + * Currently, @c rccp_T::chr touches throwing code so functions using it will always be + * @c noexcept(false). * * Many examples in @file{inst/include/RcppSimdJson/deserialize/scalar.hpp}. */ @@ -104,7 +106,7 @@ namespace deserialize { */ enum class Type_Policy : int { anything_goes = 0, /* Non-recursive arrays always become vectors of the highest present type */ - ints_as_dbls = 1, /* Non-recursive arrays of only numbers are promoted to highest type */ + ints_as_dbls = 1, /* Non-recursive arrays of only numbers are promoted to highest type */ strict = 2, /* No type promotion */ }; @@ -121,8 +123,6 @@ enum class Simplify_To : int { } // namespace deserialize - - } // namespace rcppsimdjson From f4d73e2daaac7c8c272b1ad8babc547c03b71bd0 Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Mon, 15 Jun 2020 18:45:48 -0700 Subject: [PATCH 10/16] fix .deserialize_json() when exceptions are disabled; add .load_json() file reader --- src/deserialize.cpp | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/src/deserialize.cpp b/src/deserialize.cpp index 6d81721..8d10fe0 100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -1,5 +1,6 @@ #include + //' Deserialize JSON into R Objects //' //' @param json \code{character(1L)} @@ -33,18 +34,42 @@ SEXP deserialize_json(const Rcpp::String& json, simdjson::dom::parser parser; -#if RCPPSIMDJSON_EXCEPTIONS - simdjson::dom::element parsed = json_pointer.empty() // - ? parser.parse(json) - : parser.parse(json).at(json_pointer); -#else auto [parsed, error] = json_pointer.empty() // - ? parser.parse(json).first + ? parser.parse(json) : parser.parse(json).at(json_pointer); + + if (error) { + Rcpp::stop(simdjson::error_message(error)); + } + + return deserialize::deserialize(parsed, + empty_array, + empty_object, + static_cast(simplify_to), + static_cast(type_policy), + static_cast(int64_r_type)); +} + + +// [[Rcpp::export(.load_json)]] +SEXP load_json(const std::string& file_path, + const std::string& json_pointer = "", + SEXP empty_array = R_NilValue, + SEXP empty_object = R_NilValue, + const int simplify_to = 0, + const int type_policy = 0, + const int int64_r_type = 0) { + using namespace rcppsimdjson; + + simdjson::dom::parser parser; + + auto [parsed, error] = json_pointer.empty() // + ? parser.load(file_path) + : parser.load(file_path).at(json_pointer); + if (error) { Rcpp::stop(simdjson::error_message(error)); } -#endif return deserialize::deserialize(parsed, empty_array, From 50982cb1314a1d80f4d6dda17fbf77e42ff559af Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Mon, 15 Jun 2020 18:48:04 -0700 Subject: [PATCH 11/16] sync with upstream simdjson (12 Jun 2020) --- inst/include/simdjson.cpp | 5855 ++++++++++++++++++++----------------- inst/include/simdjson.h | 829 +++--- 2 files changed, 3581 insertions(+), 3103 deletions(-) diff --git a/inst/include/simdjson.cpp b/inst/include/simdjson.cpp index a2d815f..d99dc8b 100644 --- a/inst/include/simdjson.cpp +++ b/inst/include/simdjson.cpp @@ -1,4 +1,4 @@ -/* auto-generated on Wed May 20 10:23:07 EDT 2020. Do not edit! */ +/* auto-generated on Fri 12 Jun 2020 13:09:36 EDT. Do not edit! */ /* begin file src/simdjson.cpp */ #include "simdjson.h" @@ -12,7 +12,6 @@ namespace internal { SIMDJSON_DLLIMPORTEXPORT const error_code_info error_codes[] { { SUCCESS, "No error" }, - { SUCCESS_AND_HAS_MORE, "No error and buffer still has more data" }, { CAPACITY, "This parser can't support a document that big" }, { MEMALLOC, "Error allocating memory, we're most likely out of memory" }, { TAPE_ERROR, "The JSON document has an improper structure: missing or superfluous commas, braces, missing keys, etc." }, @@ -359,8 +358,6 @@ static const uint64_t thintable_epi8[256] = { namespace simdjson { namespace haswell { -using namespace simdjson::dom; - class implementation final : public simdjson::implementation { public: really_inline implementation() : simdjson::implementation( @@ -368,11 +365,12 @@ class implementation final : public simdjson::implementation { "Intel/AMD AVX2", instruction_set::AVX2 | instruction_set::PCLMULQDQ | instruction_set::BMI1 | instruction_set::BMI2 ) {} - WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final; + WARN_UNUSED error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final; WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; - WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final; }; } // namespace haswell @@ -398,11 +396,12 @@ using namespace simdjson::dom; class implementation final : public simdjson::implementation { public: really_inline implementation() : simdjson::implementation("westmere", "Intel/AMD SSE4.2", instruction_set::SSE42 | instruction_set::PCLMULQDQ) {} - WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final; + WARN_UNUSED error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final; WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; - WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final; }; } // namespace westmere @@ -428,11 +427,12 @@ using namespace simdjson::dom; class implementation final : public simdjson::implementation { public: really_inline implementation() : simdjson::implementation("arm64", "ARM NEON", instruction_set::NEON) {} - WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final; + WARN_UNUSED error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final; WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; - WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final; }; } // namespace arm64 @@ -462,11 +462,12 @@ class implementation final : public simdjson::implementation { "Generic fallback implementation", 0 ) {} - WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final; + WARN_UNUSED error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final; WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; - WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final; - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final; }; } // namespace fallback @@ -489,21 +490,16 @@ class detect_best_supported_implementation_on_first_use final : public implement const std::string &name() const noexcept final { return set_best()->name(); } const std::string &description() const noexcept final { return set_best()->description(); } uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); } - WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept final { - return set_best()->parse(buf, len, parser); + WARN_UNUSED error_code create_dom_parser_implementation( + size_t capacity, + size_t max_length, + std::unique_ptr& dst + ) const noexcept final { + return set_best()->create_dom_parser_implementation(capacity, max_length, dst); } WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final { return set_best()->minify(buf, len, dst, dst_len); } - WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, dom::parser &parser, bool streaming) const noexcept final { - return set_best()->stage1(buf, len, parser, streaming); - } - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept final { - return set_best()->stage2(buf, len, parser); - } - WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser, size_t &next_json) const noexcept final { - return set_best()->stage2(buf, len, parser, next_json); - } really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {} private: @@ -532,21 +528,16 @@ const std::initializer_list available_implementation_poi // So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support class unsupported_implementation final : public implementation { public: - WARN_UNUSED error_code parse(const uint8_t *, size_t, dom::parser &) const noexcept final { + WARN_UNUSED error_code create_dom_parser_implementation( + size_t, + size_t, + std::unique_ptr& + ) const noexcept final { return UNSUPPORTED_ARCHITECTURE; } WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final { return UNSUPPORTED_ARCHITECTURE; } - WARN_UNUSED error_code stage1(const uint8_t *, size_t, dom::parser &, bool) const noexcept final { - return UNSUPPORTED_ARCHITECTURE; - } - WARN_UNUSED error_code stage2(const uint8_t *, size_t, dom::parser &) const noexcept final { - return UNSUPPORTED_ARCHITECTURE; - } - WARN_UNUSED error_code stage2(const uint8_t *, size_t, dom::parser &, size_t &) const noexcept final { - return UNSUPPORTED_ARCHITECTURE; - } unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {} }; @@ -1942,7 +1933,151 @@ const uint64_t mantissa_128[] = { /* simdprune_tables.h already included: #include "simdprune_tables.h" */ #if SIMDJSON_IMPLEMENTATION_ARM64 -/* begin file src/arm64/stage1.cpp */ +/* begin file src/arm64/implementation.cpp */ +/* arm64/implementation.h already included: #include "arm64/implementation.h" */ +/* begin file src/arm64/dom_parser_implementation.h */ +#ifndef SIMDJSON_ARM64_DOM_PARSER_IMPLEMENTATION_H +#define SIMDJSON_ARM64_DOM_PARSER_IMPLEMENTATION_H + +/* isadetection.h already included: #include "isadetection.h" */ + +namespace simdjson { +namespace arm64 { + +/* begin file src/generic/dom_parser_implementation.h */ +// expectation: sizeof(scope_descriptor) = 64/8. +struct scope_descriptor { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope +}; // struct scope_descriptor + +#ifdef SIMDJSON_USE_COMPUTED_GOTO +typedef void* ret_address_t; +#else +typedef char ret_address_t; +#endif + +class dom_parser_implementation final : public internal::dom_parser_implementation { +public: + /** Tape location of each open { or [ */ + std::unique_ptr containing_scope{}; + /** Return address of each open { or [ */ + std::unique_ptr ret_address{}; + /** Buffer passed to stage 1 */ + const uint8_t *buf{}; + /** Length passed to stage 1 */ + size_t len{0}; + /** Document passed to stage 2 */ + dom::document *doc{}; + /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */ + error_code error{UNINITIALIZED}; + + really_inline dom_parser_implementation(); + dom_parser_implementation(const dom_parser_implementation &) = delete; + dom_parser_implementation & operator=(const dom_parser_implementation &) = delete; + + WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; + WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; + WARN_UNUSED error_code check_for_unclosed_array() noexcept; + WARN_UNUSED error_code stage2(dom::document &doc) noexcept final; + WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final; + WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final; + WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final; +}; + +/* begin file src/generic/stage1/allocate.h */ +namespace stage1 { +namespace allocate { + +// +// Allocates stage 1 internal state and outputs in the parser +// +really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) { + size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; + parser.structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); + if (!parser.structural_indexes) { return MEMALLOC; } + parser.structural_indexes[0] = 0; + parser.n_structural_indexes = 0; + return SUCCESS; +} + +} // namespace allocate +} // namespace stage1 +/* end file src/generic/stage1/allocate.h */ +/* begin file src/generic/stage2/allocate.h */ +namespace stage2 { +namespace allocate { + +// +// Allocates stage 2 internal state and outputs in the parser +// +really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) { + parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]); + parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]); + + if (!parser.ret_address || !parser.containing_scope) { + return MEMALLOC; + } + return SUCCESS; +} + +} // namespace allocate +} // namespace stage2 +/* end file src/generic/stage2/allocate.h */ + +really_inline dom_parser_implementation::dom_parser_implementation() {} + +// Leaving these here so they can be inlined if so desired +WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept { + error_code err = stage1::allocate::set_capacity(*this, capacity); + if (err) { _capacity = 0; return err; } + _capacity = capacity; + return SUCCESS; +} + +WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { + error_code err = stage2::allocate::set_max_depth(*this, max_depth); + if (err) { _max_depth = 0; return err; } + _max_depth = max_depth; + return SUCCESS; +} +/* end file src/generic/stage2/allocate.h */ + +} // namespace arm64 +} // namespace simdjson + +#endif // SIMDJSON_ARM64_DOM_PARSER_IMPLEMENTATION_H +/* end file src/generic/stage2/allocate.h */ + +TARGET_HASWELL + +namespace simdjson { +namespace arm64 { + +WARN_UNUSED error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr& dst +) const noexcept { + dst.reset( new (std::nothrow) dom_parser_implementation() ); + if (!dst) { return MEMALLOC; } + dst->set_capacity(capacity); + dst->set_max_depth(max_depth); + return SUCCESS; +} + +} // namespace arm64 +} // namespace simdjson + +UNTARGET_REGION +/* end file src/generic/stage2/allocate.h */ +/* begin file src/arm64/dom_parser_implementation.cpp */ +/* arm64/implementation.h already included: #include "arm64/implementation.h" */ +/* arm64/dom_parser_implementation.h already included: #include "arm64/dom_parser_implementation.h" */ + +// +// Stage 1 +// /* begin file src/arm64/bitmask.h */ #ifndef SIMDJSON_ARM64_BITMASK_H #define SIMDJSON_ARM64_BITMASK_H @@ -2594,7 +2729,6 @@ really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_ #endif // SIMDJSON_ARM64_SIMD_H /* end file src/arm64/bitmanipulation.h */ /* arm64/bitmanipulation.h already included: #include "arm64/bitmanipulation.h" */ -/* arm64/implementation.h already included: #include "arm64/implementation.h" */ namespace simdjson { namespace arm64 { @@ -2665,24 +2799,21 @@ really_inline simd8 must_be_continuation(simd8 prev1, simd8 struct buf_block_reader { public: - really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} - really_inline size_t block_index() { return idx; } - really_inline bool has_full_block() const { - return idx < lenminusstep; - } - really_inline const uint8_t *full_block() const { - return &buf[idx]; - } - really_inline bool has_remainder() const { - return idx < len; - } - really_inline void get_remainder(uint8_t *tmp_buf) const { - memset(tmp_buf, 0x20, STEP_SIZE); - memcpy(tmp_buf, buf + idx, len - idx); - } - really_inline void advance() { - idx += STEP_SIZE; - } + really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + really_inline size_t block_index(); + really_inline bool has_full_block() const; + really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + really_inline size_t get_remainder(uint8_t *dst) const; + really_inline void advance(); private: const uint8_t *buf; const size_t len; @@ -2690,6 +2821,18 @@ struct buf_block_reader { size_t idx; }; +constexpr const int TITLE_SIZE = 12; + +// Routines to print masks and text for debugging bitmask operations +UNUSED static char * format_input_text_64(const uint8_t *text) { + static char *buf = (char*)malloc(sizeof(simd8x64) + 1); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + // Routines to print masks and text for debugging bitmask operations UNUSED static char * format_input_text(const simd8x64 in) { static char *buf = (char*)malloc(sizeof(simd8x64) + 1); @@ -2709,6 +2852,34 @@ UNUSED static char * format_mask(uint64_t mask) { buf[64] = '\0'; return buf; } + +template +really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +really_inline size_t buf_block_reader::block_index() { return idx; } + +template +really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} /* end file src/generic/stage1/buf_block_reader.h */ /* begin file src/generic/stage1/json_string_scanner.h */ namespace stage1 { @@ -3008,13 +3179,15 @@ template error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept { buf_block_reader reader(buf, len); json_minifier minifier(dst); + + // Index the first n-1 blocks while (reader.has_full_block()) { minifier.step(reader.full_block(), reader); } - if (likely(reader.has_remainder())) { - uint8_t block[STEP_SIZE]; - reader.get_remainder(block); + // Index the last (remainder) block, padded with spaces + uint8_t block[STEP_SIZE]; + if (likely(reader.get_remainder(block)) > 0) { minifier.step(block, reader); } @@ -3027,6 +3200,94 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); } +/* begin file src/generic/stage1/find_next_document_index.h */ +/** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. + * + * Simply put, we iterate over the structural characters, starting from + * the end. We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ';' ',' + * and when the second element is NOT one of these characters: '}' '}' ';' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ +really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) { + // TODO don't count separately, just figure out depth + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! + if (!arr_cnt && !obj_cnt) { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; + } + return 0; +} + +// Skip the last character if it is partial +really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) { + if (unlikely(len < 3)) { + switch (len) { + case 2: + if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left + return len; + case 1: + if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + return len; + case 0: + return len; + } + } + if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left + if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left + return len; +} +/* end file src/generic/stage1/find_next_document_index.h */ /* begin file src/generic/stage1/utf8_lookup2_algorithm.h */ // // Detect Unicode errors. @@ -3077,9 +3338,9 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui // support values with more than 23 bits (which a 4-byte character supports). // // e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000) -// +// // Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: -// +// // Code Points 1st 2s 3s 4s // U+0000..U+007F 00..7F // U+0080..U+07FF C2..DF 80..BF @@ -3094,6 +3355,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui using namespace simd; namespace utf8_validation { + // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)". // // Find special case UTF-8 errors where the character is technically readable (has the right length) @@ -3138,7 +3400,7 @@ namespace utf8_validation { const simd8 byte_1_high = prev1.shr<4>().lookup_16( // [0___]____ (ASCII) - 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, // [10__]____ (continuation) 0, 0, 0, 0, @@ -3169,214 +3431,6 @@ namespace utf8_validation { return byte_1_high & byte_1_low & byte_2_high; } - // - // Validate the length of multibyte characters (that each multibyte character has the right number - // of continuation characters, and that all continuation characters are part of a multibyte - // character). - // - // Algorithm - // ========= - // - // This algorithm compares *expected* continuation characters with *actual* continuation bytes, - // and emits an error anytime there is a mismatch. - // - // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte - // characters, the file will look like this: - // - // | Character | 𝄞 | | | | ₿ | | | ֏ | | a | b | - // |-----------------------|----|----|----|----|----|----|----|----|----|----|----| - // | Character Length | 4 | | | | 3 | | | 2 | | 1 | 1 | - // | Byte | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 | - // | is_second_byte | | X | | | | X | | | X | | | - // | is_third_byte | | | X | | | | X | | | | | - // | is_fourth_byte | | | | X | | | | | | | | - // | expected_continuation | | X | X | X | | X | X | | X | | | - // | is_continuation | | X | X | X | | X | X | | X | | | - // - // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation): - // - // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not - // part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just - // floating around extra outside of any character, or that there is an illegal 5-byte character, - // or maybe it's at the beginning of the file before any characters have started; but it's an - // error in all these cases. - // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means - // we started a new character before we were finished with the current one. - // - // Getting the Previous Bytes - // -------------------------- - // - // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte - // character, we need to "shift the bytes" to find that out. This is what they mean: - // - // - `is_continuation`: if the current byte is a continuation. - // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character. - // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character. - // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character. - // - // We use shuffles to go n bytes back, selecting part of the current `input` and part of the - // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller - // function, because the 1-byte-back data is used by other checks as well. - // - // Getting the Continuation Mask - // ----------------------------- - // - // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as - // numbers, using signed `<` and `>` operations to check if they are continuations or leads. - // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because - // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones). - // - // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads," - // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them. - // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0. - // - // When treated as signed numbers, they look like this: - // - // | Type | High Bits | Binary Range | Signed | - // |--------------|------------|--------------|--------| - // | ASCII | `0` | `01111111` | 127 | - // | | | `00000000` | 0 | - // | 4+-Byte Lead | `1111` | `11111111` | -1 | - // | | | `11110000 | -16 | - // | 3-Byte Lead | `1110` | `11101111` | -17 | - // | | | `11100000 | -32 | - // | 2-Byte Lead | `110` | `11011111` | -33 | - // | | | `11000000 | -64 | - // | Continuation | `10` | `10111111` | -65 | - // | | | `10000000 | -128 | - // - // This makes it pretty easy to get the continuation mask! It's just a single comparison: - // - // ``` - // is_continuation = input < -64` - // ``` - // - // We can do something similar for the others, but it takes two comparisons instead of one: "is - // the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and - // `> -64`. Surely we can do better, they're right next to each other! - // - // Getting the is_xxx Masks: Shifting the Range - // -------------------------------------------- - // - // Notice *why* continuations were a single comparison. The actual *range* would require two - // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get - // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be - // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`. - // - // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps - // ASCII down into the negative, and puts 4+-Byte Lead at the top: - // - // | Type | High Bits | Binary Range | Signed | - // |----------------------|------------|--------------|-------| - // | 4+-Byte Lead (+ 127) | `0111` | `01111111` | 127 | - // | | | `01110000 | 112 | - // |----------------------|------------|--------------|-------| - // | 3-Byte Lead (+ 127) | `0110` | `01101111` | 111 | - // | | | `01100000 | 96 | - // |----------------------|------------|--------------|-------| - // | 2-Byte Lead (+ 127) | `010` | `01011111` | 95 | - // | | | `01000000 | 64 | - // |----------------------|------------|--------------|-------| - // | Continuation (+ 127) | `00` | `00111111` | 63 | - // | | | `00000000 | 0 | - // |----------------------|------------|--------------|-------| - // | ASCII (+ 127) | `1` | `11111111` | -1 | - // | | | `10000000` | -128 | - // |----------------------|------------|--------------|-------| - // - // *Now* we can use signed `>` on all of them: - // - // ``` - // prev1 = input.prev<1> - // prev2 = input.prev<2> - // prev3 = input.prev<3> - // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128` - // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128` - // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128` - // is_second_byte = prev1_flipped > 63; // 2+-byte lead - // is_third_byte = prev2_flipped > 95; // 3+-byte lead - // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead - // ``` - // - // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number - // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3 - // `^`'s at a time on Haswell, but only 2 `+`'s). - // - // That doesn't look like it saved us any instructions, did it? Well, because we're adding the - // same number to all of them, we can save one of those `+ 128` operations by assembling - // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128 - // to it. One more instruction saved! - // - // ``` - // prev1 = input.prev<1> - // prev3 = input.prev<3> - // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128` - // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128` - // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped): // | C -> ^ D, or - // you do B | C -> | A -> ^ D. But addition and subtraction *are* associative: (A + B + C) - D can - // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and - // then adds the result together. Same number of operations, but if the processor can run - // independent things in parallel (which most can), it runs faster. - // - // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have - // a super nice advantage in that more of them can be run at the same time (they can run on 3 - // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C, - // saving us the cycle we would have earned by using +. Even more, using an instruction with a - // wider array of ports can help *other* code run ahead, too, since these instructions can "get - // out of the way," running on a port other instructions can't. - // - // Epilogue II: One More Trick - // --------------------------- - // - // There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay - // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in - // check_special_cases()--but we'll talk about that there :) - // really_inline simd8 check_multibyte_lengths(simd8 input, simd8 prev_input, simd8 prev1) { simd8 prev2 = input.prev<2>(prev_input); simd8 prev3 = input.prev<3>(prev_input); @@ -3514,16 +3568,22 @@ class bit_indexer { class json_structural_indexer { public: + /** + * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. + * + * @param partial Setting the partial parameter to true allows the find_structural_bits to + * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If + * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. + */ template - static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept; + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept; private: - really_inline json_structural_indexer(uint32_t *structural_indexes) - : indexer{structural_indexes} {} + really_inline json_structural_indexer(uint32_t *structural_indexes); template really_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; really_inline void next(simd::simd8x64 in, json_block block, size_t idx); - really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming); + really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial); json_scanner scanner{}; utf8_checker checker{}; @@ -3532,42 +3592,44 @@ class json_structural_indexer { uint64_t unescaped_chars_error = 0; }; -really_inline void json_structural_indexer::next(simd::simd8x64 in, json_block block, size_t idx) { - uint64_t unescaped = in.lteq(0x1F); - checker.check_next_input(in); - indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser - prev_structurals = block.structural_start(); - unescaped_chars_error |= block.non_quote_inside_string(unescaped); -} +really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} -really_inline error_code json_structural_indexer::finish(parser &parser, size_t idx, size_t len, bool streaming) { - // Write out the final iteration's structurals - indexer.write(uint32_t(idx-64), prev_structurals); +// +// PERF NOTES: +// We pipe 2 inputs through these stages: +// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load +// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. +// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. +// The output of step 1 depends entirely on this information. These functions don't quite use +// up enough CPU: the second half of the functions is highly serial, only using 1 execution core +// at a time. The second input's scans has some dependency on the first ones finishing it, but +// they can make a lot of progress before they need that information. +// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that +// to finish: utf-8 checks and generating the output from the last iteration. +// +// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all +// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough +// workout. +// +template +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept { + if (unlikely(len > parser.capacity())) { return CAPACITY; } + if (partial) { len = trim_partial_utf8(buf, len); } - error_code error = scanner.finish(streaming); - if (unlikely(error != SUCCESS)) { return error; } + buf_block_reader reader(buf, len); + json_structural_indexer indexer(parser.structural_indexes.get()); - if (unescaped_chars_error) { - return UNESCAPED_CHARS; + // Read all but the last block + while (reader.has_full_block()) { + indexer.step(reader.full_block(), reader); } - parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); - /* a valid JSON file cannot have zero structural indexes - we should have - * found something */ - if (unlikely(parser.n_structural_indexes == 0u)) { - return EMPTY; - } - if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { - return UNEXPECTED_ERROR; - } - if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) { - /* the string might not be NULL terminated, but we add a virtual NULL - * ending character. */ - parser.structural_indexes[parser.n_structural_indexes++] = uint32_t(len); - } - /* make it safe to dereference one beyond this array */ - parser.structural_indexes[parser.n_structural_indexes] = 0; - return checker.errors(); + // Take care of the last block (will always be there unless file is empty) + uint8_t block[STEP_SIZE]; + if (unlikely(reader.get_remainder(block) == 0)) { return EMPTY; } + indexer.step(block, reader); + + return indexer.finish(parser, reader.block_index(), len, partial); } template<> @@ -3589,61 +3651,76 @@ really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_b reader.advance(); } -// -// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. -// -// PERF NOTES: -// We pipe 2 inputs through these stages: -// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load -// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. -// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. -// The output of step 1 depends entirely on this information. These functions don't quite use -// up enough CPU: the second half of the functions is highly serial, only using 1 execution core -// at a time. The second input's scans has some dependency on the first ones finishing it, but -// they can make a lot of progress before they need that information. -// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that -// to finish: utf-8 checks and generating the output from the last iteration. -// -// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all -// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough -// workout. -// -// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings. -// The caller should still ensure that the input is valid UTF-8. If you are processing substrings, -// you may want to call on a function like trimmed_length_safe_utf8. -template -error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept { - if (unlikely(len > parser.capacity())) { return CAPACITY; } +really_inline void json_structural_indexer::next(simd::simd8x64 in, json_block block, size_t idx) { + uint64_t unescaped = in.lteq(0x1F); + checker.check_next_input(in); + indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser + prev_structurals = block.structural_start(); + unescaped_chars_error |= block.non_quote_inside_string(unescaped); +} - buf_block_reader reader(buf, len); - json_structural_indexer indexer(parser.structural_indexes.get()); - while (reader.has_full_block()) { - indexer.step(reader.full_block(), reader); - } +really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) { + // Write out the final iteration's structurals + indexer.write(uint32_t(idx-64), prev_structurals); + + error_code error = scanner.finish(partial); + if (unlikely(error != SUCCESS)) { return error; } - if (likely(reader.has_remainder())) { - uint8_t block[STEP_SIZE]; - reader.get_remainder(block); - indexer.step(block, reader); + if (unescaped_chars_error) { + return UNESCAPED_CHARS; } - return indexer.finish(parser, reader.block_index(), len, streaming); + parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); + /*** + * This is related to https://github.com/simdjson/simdjson/issues/906 + * Basically, we want to make sure that if the parsing continues beyond the last (valid) + * structural character, it quickly stops. + * Only three structural characters can be repeated without triggering an error in JSON: [,] and }. + * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing + * continues, then it must be [,] or }. + * Suppose it is ] or }. We backtrack to the first character, what could it be that would + * not trigger an error? It could be ] or } but no, because you can't start a document that way. + * It can't be a comma, a colon or any simple value. So the only way we could continue is + * if the repeated character is [. But if so, the document must start with [. But if the document + * starts with [, it should end with ]. If we enforce that rule, then we would get + * ][[ which is invalid. + **/ + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 2] = 0; + parser.next_structural_index = 0; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (unlikely(parser.n_structural_indexes == 0u)) { + return EMPTY; + } + if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { + return UNEXPECTED_ERROR; + } + if (partial) { + auto new_structural_indexes = find_next_document_index(parser); + if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { + return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. + } + parser.n_structural_indexes = new_structural_indexes; + } + return checker.errors(); } } // namespace stage1 /* end file src/generic/stage1/json_structural_indexer.h */ -WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept { - return arm64::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming); +WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept { + this->buf = _buf; + this->len = _len; + return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming); } } // namespace arm64 } // namespace simdjson -/* end file src/generic/stage1/json_structural_indexer.h */ -/* begin file src/arm64/stage2.cpp */ -#ifndef SIMDJSON_ARM64_STAGE2_H -#define SIMDJSON_ARM64_STAGE2_H -/* arm64/implementation.h already included: #include "arm64/implementation.h" */ +// +// Stage 2 +// + /* begin file src/arm64/stringparsing.h */ #ifndef SIMDJSON_ARM64_STRINGPARSING_H #define SIMDJSON_ARM64_STRINGPARSING_H @@ -4049,10 +4126,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) { // If you consume a large value and you map it to "infinity", you will no // longer be able to serialize back a standard-compliant JSON. And there is // no realistic application where you might need values so large than they - // can't fit in binary64. The maximal value is about 1.7976931348623157 × + // can't fit in binary64. The maximal value is about 1.7976931348623157 x // 10^308 It is an unimaginable large number. There will never be any piece of // engineering involving as many as 10^308 parts. It is estimated that there - // are about 10^80 atoms in the universe.  The estimate for the total number + // are about 10^80 atoms in the universe. The estimate for the total number // of electrons is similar. Using a double-precision floating-point value, we // can represent easily the number of atoms in the universe. We could also // represent the number of ways you can pick any three individual atoms at @@ -4072,26 +4149,6 @@ really_inline bool is_integer(char c) { // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers } -// We need to check that the character following a zero is valid. This is -// probably frequent and it is harder than it looks. We are building all of this -// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)... -const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - -really_inline bool -is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) { - return structural_or_whitespace_or_exponent_or_decimal_negated[c]; -} // check quickly whether the next 8 chars are made of digits // at a glance, it looks better than Mula's @@ -4169,14 +4226,14 @@ never_inline bool parse_large_integer(const uint8_t *const src, // as a positive signed integer, but the negative version is // possible. constexpr int64_t signed_answer = INT64_MIN; - writer.write_s64(signed_answer); + writer.append_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif } else { // we can negate safely int64_t signed_answer = -static_cast(i); - writer.write_s64(signed_answer); + writer.append_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif @@ -4189,12 +4246,12 @@ never_inline bool parse_large_integer(const uint8_t *const src, #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif - writer.write_s64(i); + writer.append_s64(i); } else { #ifdef JSON_TEST_NUMBERS // for unit testing found_unsigned_integer(i, src); #endif - writer.write_u64(i); + writer.append_u64(i); } } return is_structural_or_whitespace(*p); @@ -4204,7 +4261,7 @@ template bool slow_float_parsing(UNUSED const char * src, W writer) { double d; if (parse_float_strtod(src, &d)) { - writer.write_double(d); + writer.append_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, (const uint8_t *)src); #endif @@ -4228,10 +4285,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) { template really_inline bool parse_number(UNUSED const uint8_t *const src, UNUSED bool found_minus, - W writer) { + W &writer) { #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes // useful to skip parsing - writer.write_s64(0); // always write zero + writer.append_s64(0); // always write zero return true; // always succeeds #else const char *p = reinterpret_cast(src); @@ -4251,7 +4308,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, uint64_t i; // an unsigned int avoids signed overflows (which are bad) if (*p == '0') { // 0 cannot be followed by an integer ++p; - if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) { + if (is_integer(*p)) { #ifdef JSON_TEST_NUMBERS // for unit testing found_invalid_number(src); #endif @@ -4375,7 +4432,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, } // we over-decrement by one when there is a '.' digit_count -= int(start - start_digits); - if (digit_count >= 19) { + if (unlikely(digit_count >= 19)) { // Ok, chances are good that we had an overflow! // this is almost never going to get called!!! // we start anew, going slowly!!! @@ -4383,14 +4440,22 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, // 10000000000000000000000000000000000000000000e+308 // 3.1415926535897932384626433832795028841971693993751 // - return slow_float_parsing((const char *) src, writer); + bool success = slow_float_parsing((const char *) src, writer); + // The number was already written, but we made a copy of the writer + // when we passed it to the parse_large_integer() function, so + writer.skip_double(); + return success; } } if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!! // this is almost never going to get called!!! // we start anew, going slowly!!! - return slow_float_parsing((const char *) src, writer); + bool success = slow_float_parsing((const char *) src, writer); + // The number was already written, but we made a copy of the writer when we passed it to the + // slow_float_parsing() function, so we have to skip those tape spots now that we've returned + writer.skip_double(); + return success; } bool success = true; double d = compute_float_64(exponent, i, negative, &success); @@ -4399,7 +4464,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, success = parse_float_strtod((const char *)src, &d); } if (success) { - writer.write_double(d); + writer.append_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, src); #endif @@ -4414,10 +4479,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, if (unlikely(digit_count >= 18)) { // this is uncommon!!! // there is a good chance that we had an overflow, so we need // need to recover: we parse the whole thing again. - return parse_large_integer(src, writer, found_minus); + bool success = parse_large_integer(src, writer, found_minus); + // The number was already written, but we made a copy of the writer + // when we passed it to the parse_large_integer() function, so + writer.skip_large_integer(); + return success; } i = negative ? 0 - i : i; - writer.write_s64(i); + writer.append_s64(i); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif @@ -4439,6 +4508,72 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, namespace simdjson { namespace arm64 { +/* begin file src/generic/stage2/logger.h */ +// This is for an internal-only stage 2 specific logger. +// Set LOG_ENABLED = true to log what stage 2 is doing! +namespace logger { + static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; + + static constexpr const bool LOG_ENABLED = false; + static constexpr const int LOG_EVENT_LEN = 30; + static constexpr const int LOG_BUFFER_LEN = 20; + static constexpr const int LOG_DETAIL_LEN = 50; + static constexpr const int LOG_INDEX_LEN = 10; + + static int log_depth; // Not threadsafe. Log only. + + // Helper to turn unprintable or newline characters into spaces + static really_inline char printable_char(char c) { + if (c >= 0x20) { + return c; + } else { + return ' '; + } + } + + // Print the header and set up log_start + static really_inline void log_start() { + if (LOG_ENABLED) { + log_depth = 0; + printf("\n"); + printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index"); + printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES); + } + } + + static really_inline void log_string(const char *message) { + if (LOG_ENABLED) { + printf("%s\n", message); + } + } + + // Logs a single line of + template + static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { + if (LOG_ENABLED) { + printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); + { + // Print the next N characters in the buffer. + printf("| "); + // Otherwise, print the characters starting from the buffer position. + // Print spaces for unprintable or newline characters. + for (int i=0;i really_inline bool with_space_terminated_copy(const F& f) { @@ -4533,32 +4676,25 @@ class structural_iterator { * practice unless you are in the strange scenario where you have many JSON * documents made of single atoms. */ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + char *copy = static_cast(malloc(parser.len + SIMDJSON_PADDING)); if (copy == nullptr) { return true; } - memcpy(copy, buf, len); - memset(copy + len, ' ', SIMDJSON_PADDING); - bool result = f(reinterpret_cast(copy), idx); + memcpy(copy, buf, parser.len); + memset(copy + parser.len, ' ', SIMDJSON_PADDING); + bool result = f(reinterpret_cast(copy), *current_structural); free(copy); return result; } really_inline bool past_end(uint32_t n_structural_indexes) { - return next_structural+1 > n_structural_indexes; + return current_structural >= &parser.structural_indexes[n_structural_indexes]; } really_inline bool at_end(uint32_t n_structural_indexes) { - return next_structural+1 == n_structural_indexes; + return current_structural == &parser.structural_indexes[n_structural_indexes]; } - really_inline size_t next_structural_index() { - return next_structural; + really_inline bool at_beginning() { + return current_structural == parser.structural_indexes.get(); } - - const uint8_t* const buf; - const size_t len; - const uint32_t* const structural_indexes; - size_t next_structural; // next structural index - size_t idx{0}; // location of the structural character in the input (buf) - uint8_t c{0}; // used to track the (structural) character we are looking at }; } // namespace stage2 @@ -4570,8 +4706,105 @@ class structural_iterator { // "simdjson/stage2.h" (this simplifies amalgation) namespace stage2 { +namespace { // Make everything here private + +/* begin file src/generic/stage2/tape_writer.h */ +struct tape_writer { + /** The next place to write to tape */ + uint64_t *next_tape_loc; + + /** Write a signed 64-bit value to tape. */ + really_inline void append_s64(int64_t value) noexcept; + + /** Write an unsigned 64-bit value to tape. */ + really_inline void append_u64(uint64_t value) noexcept; + + /** Write a double value to tape. */ + really_inline void append_double(double value) noexcept; + + /** + * Append a tape entry (an 8-bit type,and 56 bits worth of value). + */ + really_inline void append(uint64_t val, internal::tape_type t) noexcept; + + /** + * Skip the current tape entry without writing. + * + * Used to skip the start of the container, since we'll come back later to fill it in when the + * container ends. + */ + really_inline void skip() noexcept; + + /** + * Skip the number of tape entries necessary to write a large u64 or i64. + */ + really_inline void skip_large_integer() noexcept; + + /** + * Skip the number of tape entries necessary to write a double. + */ + really_inline void skip_double() noexcept; + + /** + * Write a value to a known location on tape. + * + * Used to go back and write out the start of a container after the container ends. + */ + really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; + +private: + /** + * Append both the tape entry, and a supplementary value following it. Used for types that need + * all 64 bits, such as double and uint64_t. + */ + template + really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; +}; // struct number_writer + +really_inline void tape_writer::append_s64(int64_t value) noexcept { + append2(0, value, internal::tape_type::INT64); +} + +really_inline void tape_writer::append_u64(uint64_t value) noexcept { + append(0, internal::tape_type::UINT64); + *next_tape_loc = value; + next_tape_loc++; +} + +/** Write a double value to tape. */ +really_inline void tape_writer::append_double(double value) noexcept { + append2(0, value, internal::tape_type::DOUBLE); +} -using internal::ret_address; +really_inline void tape_writer::skip() noexcept { + next_tape_loc++; +} + +really_inline void tape_writer::skip_large_integer() noexcept { + next_tape_loc += 2; +} + +really_inline void tape_writer::skip_double() noexcept { + next_tape_loc += 2; +} + +really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept { + *next_tape_loc = val | ((uint64_t(char(t))) << 56); + next_tape_loc++; +} + +template +really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept { + append(val, t); + static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); + memcpy(next_tape_loc, &val2, sizeof(val2)); + next_tape_loc++; +} + +really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept { + tape_loc = val | ((uint64_t(char(t))) << 56); +} +/* end file src/generic/stage2/tape_writer.h */ #ifdef SIMDJSON_USE_COMPUTED_GOTO #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue } @@ -4602,102 +4835,88 @@ using internal::ret_address; #endif // SIMDJSON_USE_COMPUTED_GOTO struct unified_machine_addresses { - ret_address array_begin; - ret_address array_continue; - ret_address error; - ret_address finish; - ret_address object_begin; - ret_address object_continue; + ret_address_t array_begin; + ret_address_t array_continue; + ret_address_t error; + ret_address_t finish; + ret_address_t object_begin; + ret_address_t object_continue; }; #undef FAIL_IF #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } } -struct number_writer { - parser &doc_parser; - - really_inline void write_s64(int64_t value) noexcept { - write_tape(0, internal::tape_type::INT64); - std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value)); - ++doc_parser.current_loc; - } - really_inline void write_u64(uint64_t value) noexcept { - write_tape(0, internal::tape_type::UINT64); - doc_parser.doc.tape[doc_parser.current_loc++] = value; - } - really_inline void write_double(double value) noexcept { - write_tape(0, internal::tape_type::DOUBLE); - static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size"); - memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double)); - // doc.tape[doc.current_loc++] = *((uint64_t *)&d); - } - really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { - doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); - } -}; // struct number_writer - -struct structural_parser { - structural_iterator structurals; - parser &doc_parser; +struct structural_parser : structural_iterator { + /** Lets you append to the tape */ + tape_writer tape; /** Next write location in the string buf for stage 2 parsing */ - uint8_t *current_string_buf_loc{}; - uint32_t depth; - - really_inline structural_parser( - const uint8_t *buf, - size_t len, - parser &_doc_parser, - uint32_t next_structural = 0 - ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {} - - WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) { - doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc; - doc_parser.containing_scope[depth].count = 0; - write_tape(0, type); // if the document is correct, this gets rewritten later - doc_parser.ret_address[depth] = continue_state; + uint8_t *current_string_buf_loc; + /** Current depth (nested objects and arrays) */ + uint32_t depth{0}; + + // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations + really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index) + : structural_iterator(_parser, start_structural_index), + tape{parser.doc->tape.get()}, + current_string_buf_loc{parser.doc->string_buf.get()} { + } + + WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) { + parser.containing_scope[depth].tape_index = next_tape_index(); + parser.containing_scope[depth].count = 0; + tape.skip(); // We don't actually *write* the start element until the end. + parser.ret_address[depth] = continue_state; depth++; - return depth >= doc_parser.max_depth(); + bool exceeded_max_depth = depth >= parser.max_depth(); + if (exceeded_max_depth) { log_error("Exceeded max depth!"); } + return exceeded_max_depth; } - WARN_UNUSED really_inline bool start_document(ret_address continue_state) { - return start_scope(internal::tape_type::ROOT, continue_state); + WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) { + log_start_value("document"); + return start_scope(continue_state); } - WARN_UNUSED really_inline bool start_object(ret_address continue_state) { - return start_scope(internal::tape_type::START_OBJECT, continue_state); + WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) { + log_start_value("object"); + return start_scope(continue_state); } - WARN_UNUSED really_inline bool start_array(ret_address continue_state) { - return start_scope(internal::tape_type::START_ARRAY, continue_state); + WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) { + log_start_value("array"); + return start_scope(continue_state); } // this function is responsible for annotating the start of the scope - really_inline void end_scope(internal::tape_type type) noexcept { + really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept { depth--; - // write our doc.tape location to the header scope + // write our doc->tape location to the header scope // The root scope gets written *at* the previous location. - write_tape(doc_parser.containing_scope[depth].tape_index, type); + tape.append(parser.containing_scope[depth].tape_index, end); // count can overflow if it exceeds 24 bits... so we saturate // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). - const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index; - const uint32_t count = doc_parser.containing_scope[depth].count; + const uint32_t start_tape_index = parser.containing_scope[depth].tape_index; + const uint32_t count = parser.containing_scope[depth].count; const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; - // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index] - doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32); + // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index] + tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start); + } + + really_inline uint32_t next_tape_index() { + return uint32_t(tape.next_tape_loc - parser.doc->tape.get()); } really_inline void end_object() { - end_scope(internal::tape_type::END_OBJECT); + log_end_value("object"); + end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); } really_inline void end_array() { - end_scope(internal::tape_type::END_ARRAY); + log_end_value("array"); + end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); } really_inline void end_document() { - end_scope(internal::tape_type::ROOT); - } - - really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { - doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); + log_end_value("document"); + end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT); } // increment_count increments the count of keys in an object or values in an array. @@ -4705,17 +4924,16 @@ struct structural_parser { // must be increment in the preceding depth (depth-1) where the array or // the object resides. really_inline void increment_count() { - doc_parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 + parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 } really_inline uint8_t *on_start_string() noexcept { - /* we advance the point, accounting for the fact that we have a NULL - * termination */ - write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING); + // we advance the point, accounting for the fact that we have a NULL termination + tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING); return current_string_buf_loc + sizeof(uint32_t); } - really_inline bool on_end_string(uint8_t *dst) noexcept { + really_inline void on_end_string(uint8_t *dst) noexcept { uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); // TODO check for overflow in case someone has a crazy string (>=4GB?) // But only add the overflow check when the document itself exceeds 4GB @@ -4725,73 +4943,49 @@ struct structural_parser { // be NULL terminated? It comes at a small cost *dst = 0; current_string_buf_loc = dst + 1; - return true; } - WARN_UNUSED really_inline bool parse_string() { + WARN_UNUSED really_inline bool parse_string(bool key = false) { + log_value(key ? "key" : "string"); uint8_t *dst = on_start_string(); - dst = stringparsing::parse_string(structurals.current(), dst); + dst = stringparsing::parse_string(current(), dst); if (dst == nullptr) { + log_error("Invalid escape in string"); return true; } - return !on_end_string(dst); + on_end_string(dst); + return false; } WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) { - number_writer writer{doc_parser}; - return !numberparsing::parse_number(src, found_minus, writer); + log_value("number"); + bool succeeded = numberparsing::parse_number(src, found_minus, tape); + if (!succeeded) { log_error("Invalid number"); } + return !succeeded; } WARN_UNUSED really_inline bool parse_number(bool found_minus) { - return parse_number(structurals.current(), found_minus); - } - - WARN_UNUSED really_inline bool parse_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; } - write_tape(0, internal::tape_type::TRUE_VALUE); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; } - write_tape(0, internal::tape_type::FALSE_VALUE); - break; - case 'n': - if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; } - write_tape(0, internal::tape_type::NULL_VALUE); - break; - default: - return true; - } - return false; + return parse_number(current(), found_minus); } - WARN_UNUSED really_inline bool parse_single_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; } - write_tape(0, internal::tape_type::TRUE_VALUE); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; } - write_tape(0, internal::tape_type::FALSE_VALUE); - break; - case 'n': - if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; } - write_tape(0, internal::tape_type::NULL_VALUE); - break; - default: - return true; - } - return false; - } - - WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) { - switch (structurals.current_char()) { + WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) { + switch (advance_char()) { case '"': FAIL_IF( parse_string() ); return continue_state; - case 't': case 'f': case 'n': - FAIL_IF( parse_atom() ); + case 't': + log_value("true"); + FAIL_IF( !atomparsing::is_valid_true_atom(current()) ); + tape.append(0, internal::tape_type::TRUE_VALUE); + return continue_state; + case 'f': + log_value("false"); + FAIL_IF( !atomparsing::is_valid_false_atom(current()) ); + tape.append(0, internal::tape_type::FALSE_VALUE); + return continue_state; + case 'n': + log_value("null"); + FAIL_IF( !atomparsing::is_valid_null_atom(current()) ); + tape.append(0, internal::tape_type::NULL_VALUE); return continue_state; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': @@ -4807,40 +5001,27 @@ struct structural_parser { FAIL_IF( start_array(continue_state) ); return addresses.array_begin; default: + log_error("Non-value found when value was expected!"); return addresses.error; } } WARN_UNUSED really_inline error_code finish() { - // the string might not be NULL terminated. - if ( !structurals.at_end(doc_parser.n_structural_indexes) ) { - return on_error(TAPE_ERROR); - } end_document(); + parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]); + if (depth != 0) { - return on_error(TAPE_ERROR); - } - if (doc_parser.containing_scope[depth].tape_index != 0) { - return on_error(TAPE_ERROR); + log_error("Unclosed objects or arrays!"); + return parser.error = TAPE_ERROR; } - return on_success(SUCCESS); - } - - really_inline error_code on_error(error_code new_error_code) noexcept { - doc_parser.error = new_error_code; - return new_error_code; - } - really_inline error_code on_success(error_code success_code) noexcept { - doc_parser.error = success_code; - doc_parser.valid = true; - return success_code; + return SUCCESS; } WARN_UNUSED really_inline error_code error() { - /* We do not need the next line because this is done by doc_parser.init_stage2(), + /* We do not need the next line because this is done by parser.init_stage2(), * pessimistically. - * doc_parser.is_valid = false; + * parser.is_valid = false; * At this point in the code, we have all the time in the world. * Note that we know exactly where we are in the document so we could, * without any overhead on the processing code, report a specific @@ -4848,12 +5029,12 @@ struct structural_parser { * We could even trigger special code paths to assess what happened * carefully, * all without any added cost. */ - if (depth >= doc_parser.max_depth()) { - return on_error(DEPTH_ERROR); + if (depth >= parser.max_depth()) { + return parser.error = DEPTH_ERROR; } - switch (structurals.current_char()) { + switch (current_char()) { case '"': - return on_error(STRING_ERROR); + return parser.error = STRING_ERROR; case '0': case '1': case '2': @@ -4865,92 +5046,124 @@ struct structural_parser { case '8': case '9': case '-': - return on_error(NUMBER_ERROR); + return parser.error = NUMBER_ERROR; case 't': - return on_error(T_ATOM_ERROR); + return parser.error = T_ATOM_ERROR; case 'n': - return on_error(N_ATOM_ERROR); + return parser.error = N_ATOM_ERROR; case 'f': - return on_error(F_ATOM_ERROR); + return parser.error = F_ATOM_ERROR; default: - return on_error(TAPE_ERROR); + return parser.error = TAPE_ERROR; } } really_inline void init() { - current_string_buf_loc = doc_parser.doc.string_buf.get(); - doc_parser.current_loc = 0; - doc_parser.valid = false; - doc_parser.error = UNINITIALIZED; + log_start(); + parser.error = UNINITIALIZED; } - WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) { - init(); // sets is_valid to false - if (len > doc_parser.capacity()) { - return CAPACITY; + WARN_UNUSED really_inline error_code start(ret_address_t finish_state) { + // If there are no structurals left, return EMPTY + if (at_end(parser.n_structural_indexes)) { + return parser.error = EMPTY; } - // Advance to the first character as soon as possible - structurals.advance_char(); + + init(); // Push the root scope (there is always at least one scope) if (start_document(finish_state)) { - return on_error(DEPTH_ERROR); + return parser.error = DEPTH_ERROR; } return SUCCESS; } - really_inline char advance_char() { - return structurals.advance_char(); + really_inline void log_value(const char *type) { + logger::log_line(*this, "", type, ""); } -}; + + static really_inline void log_start() { + logger::log_start(); + } + + really_inline void log_start_value(const char *type) { + logger::log_line(*this, "+", type, ""); + if (logger::LOG_ENABLED) { logger::log_depth++; } + } + + really_inline void log_end_value(const char *type) { + if (logger::LOG_ENABLED) { logger::log_depth--; } + logger::log_line(*this, "-", type, ""); + } + + really_inline void log_error(const char *error) { + logger::log_line(*this, "", "ERROR", error); + } +}; // struct structural_parser // Redefine FAIL_IF to use goto since it'll be used inside the function now #undef FAIL_IF #define FAIL_IF(EXPR) { if (EXPR) { goto error; } } -} // namespace stage2 - -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. - ***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { +template +WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept { + dom_parser.doc = &doc; static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::structural_parser parser(buf, len, doc_parser); - error_code result = parser.start(len, addresses.finish); + stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0); + error_code result = parser.start(addresses.finish); if (result) { return result; } // // Read first value // - switch (parser.structurals.current_char()) { + switch (parser.current_char()) { case '{': FAIL_IF( parser.start_object(addresses.finish) ); goto object_begin; case '[': FAIL_IF( parser.start_array(addresses.finish) ); + // Make sure the outer array is closed before continuing; otherwise, there are ways we could get + // into memory corruption. See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) { + if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') { + goto error; + } + } goto array_begin; case '"': FAIL_IF( parser.parse_string() ); goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); + case 't': + parser.log_value("true"); + FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) ); + parser.tape.append(0, internal::tape_type::TRUE_VALUE); + goto finish; + case 'f': + parser.log_value("false"); + FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) ); + parser.tape.append(0, internal::tape_type::FALSE_VALUE); + goto finish; + case 'n': + parser.log_value("null"); + FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) ); + parser.tape.append(0, internal::tape_type::NULL_VALUE); goto finish; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], false); }) ); goto finish; case '-': FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], true); }) ); goto finish; default: + parser.log_error("Document starts with a non-value character"); goto error; } @@ -4961,43 +5174,45 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa switch (parser.advance_char()) { case '"': { parser.increment_count(); - FAIL_IF( parser.parse_string() ); + FAIL_IF( parser.parse_string(true) ); goto object_key_state; } case '}': parser.end_object(); goto scope_end; default: + parser.log_error("Object does not start with a key"); goto error; } object_key_state: - FAIL_IF( parser.advance_char() != ':' ); - parser.advance_char(); + if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; } GOTO( parser.parse_value(addresses, addresses.object_continue) ); object_continue: switch (parser.advance_char()) { case ',': parser.increment_count(); - FAIL_IF( parser.advance_char() != '"' ); - FAIL_IF( parser.parse_string() ); + if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; } + FAIL_IF( parser.parse_string(true) ); goto object_key_state; case '}': parser.end_object(); goto scope_end; default: + parser.log_error("No comma between object fields"); goto error; } scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); + CONTINUE( parser.parser.ret_address[parser.depth] ); // // Array parser states // array_begin: - if (parser.advance_char() == ']') { + if (parser.peek_next_char() == ']') { + parser.advance_char(); parser.end_array(); goto scope_end; } @@ -5012,12 +5227,12 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa switch (parser.advance_char()) { case ',': parser.increment_count(); - parser.advance_char(); goto main_array_switch; case ']': parser.end_array(); goto scope_end; default: + parser.log_error("Missing comma between array values"); goto error; } @@ -5028,194 +5243,298 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa return parser.error(); } -WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { - error_code code = stage1(buf, len, doc_parser, false); - if (!code) { - code = stage2(buf, len, doc_parser); - } - return code; -} -/* end file src/generic/stage2/structural_parser.h */ -/* begin file src/generic/stage2/streaming_structural_parser.h */ -namespace stage2 { - -struct streaming_structural_parser: structural_parser { - really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {} - - // override to add streaming - WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) { - init(); // sets is_valid to false - // Capacity ain't no thang for streaming, so we don't check it. - // Advance to the first character as soon as possible - advance_char(); - // Push the root scope (there is always at least one scope) - if (start_document(finish_parser)) { - return on_error(DEPTH_ERROR); - } - return SUCCESS; - } - - // override to add streaming - WARN_UNUSED really_inline error_code finish() { - if ( structurals.past_end(doc_parser.n_structural_indexes) ) { - return on_error(TAPE_ERROR); - } - end_document(); - if (depth != 0) { - return on_error(TAPE_ERROR); - } - if (doc_parser.containing_scope[depth].tape_index != 0) { - return on_error(TAPE_ERROR); - } - bool finished = structurals.at_end(doc_parser.n_structural_indexes); - return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE); - } -}; - +} // namespace {} } // namespace stage2 /************ * The JSON is parsed to a tape, see the accompanying tape.md file * for documentation. ***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept { - static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json)); - error_code result = parser.start(len, addresses.finish); +WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { + error_code result = stage2::parse_structurals(*this, _doc); if (result) { return result; } - // - // Read first value - // - switch (parser.structurals.current_char()) { - case '{': - FAIL_IF( parser.start_object(addresses.finish) ); - goto object_begin; - case '[': - FAIL_IF( parser.start_array(addresses.finish) ); - goto array_begin; - case '"': - FAIL_IF( parser.parse_string() ); - goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); - goto finish; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { - return parser.parse_number(©[idx], false); - }) - ); - goto finish; - case '-': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { - return parser.parse_number(©[idx], true); - }) - ); - goto finish; - default: - goto error; - } - -// -// Object parser parsers -// -object_begin: - switch (parser.advance_char()) { - case '"': { - FAIL_IF( parser.parse_string() ); - goto object_key_parser; - } - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } - -object_key_parser: - FAIL_IF( parser.advance_char() != ':' ); - parser.increment_count(); - parser.advance_char(); - GOTO( parser.parse_value(addresses, addresses.object_continue) ); - -object_continue: - switch (parser.advance_char()) { - case ',': - FAIL_IF( parser.advance_char() != '"' ); - FAIL_IF( parser.parse_string() ); - goto object_key_parser; - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } - -scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); -// -// Array parser parsers -// -array_begin: - if (parser.advance_char() == ']') { - parser.end_array(); - goto scope_end; + // If we didn't make it to the end, it's an error + if ( next_structural_index != n_structural_indexes ) { + logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return error = TAPE_ERROR; } - parser.increment_count(); - -main_array_switch: - /* we call update char on all paths in, so we can peek at parser.c on the - * on paths that can accept a close square brace (post-, and at start) */ - GOTO( parser.parse_value(addresses, addresses.array_continue) ); -array_continue: - switch (parser.advance_char()) { - case ',': - parser.increment_count(); - parser.advance_char(); - goto main_array_switch; - case ']': - parser.end_array(); - goto scope_end; - default: - goto error; - } + return SUCCESS; +} -finish: - next_json = parser.structurals.next_structural_index(); - return parser.finish(); +/************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. + ***********/ +WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + return stage2::parse_structurals(*this, _doc); +} +/* end file src/generic/stage2/tape_writer.h */ -error: - return parser.error(); +WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { + error_code err = stage1(_buf, _len, false); + if (err) { return err; } + return stage2(_doc); } -/* end file src/generic/stage2/streaming_structural_parser.h */ } // namespace arm64 } // namespace simdjson - -#endif // SIMDJSON_ARM64_STAGE2_H -/* end file src/generic/stage2/streaming_structural_parser.h */ +/* end file src/generic/stage2/tape_writer.h */ #endif #if SIMDJSON_IMPLEMENTATION_FALLBACK -/* begin file src/fallback/stage1.cpp */ +/* begin file src/fallback/implementation.cpp */ /* fallback/implementation.h already included: #include "fallback/implementation.h" */ +/* begin file src/fallback/dom_parser_implementation.h */ +#ifndef SIMDJSON_FALLBACK_DOM_PARSER_IMPLEMENTATION_H +#define SIMDJSON_FALLBACK_DOM_PARSER_IMPLEMENTATION_H + +/* isadetection.h already included: #include "isadetection.h" */ namespace simdjson { namespace fallback { -namespace stage1 { -class structural_scanner { +/* begin file src/generic/dom_parser_implementation.h */ +// expectation: sizeof(scope_descriptor) = 64/8. +struct scope_descriptor { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope +}; // struct scope_descriptor + +#ifdef SIMDJSON_USE_COMPUTED_GOTO +typedef void* ret_address_t; +#else +typedef char ret_address_t; +#endif + +class dom_parser_implementation final : public internal::dom_parser_implementation { public: + /** Tape location of each open { or [ */ + std::unique_ptr containing_scope{}; + /** Return address of each open { or [ */ + std::unique_ptr ret_address{}; + /** Buffer passed to stage 1 */ + const uint8_t *buf{}; + /** Length passed to stage 1 */ + size_t len{0}; + /** Document passed to stage 2 */ + dom::document *doc{}; + /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */ + error_code error{UNINITIALIZED}; + + really_inline dom_parser_implementation(); + dom_parser_implementation(const dom_parser_implementation &) = delete; + dom_parser_implementation & operator=(const dom_parser_implementation &) = delete; + + WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; + WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; + WARN_UNUSED error_code check_for_unclosed_array() noexcept; + WARN_UNUSED error_code stage2(dom::document &doc) noexcept final; + WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final; + WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final; + WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final; +}; -really_inline structural_scanner(const uint8_t *_buf, uint32_t _len, parser &_doc_parser, bool _streaming) - : buf{_buf}, next_structural_index{_doc_parser.structural_indexes.get()}, doc_parser{_doc_parser}, idx{0}, len{_len}, error{SUCCESS}, streaming{_streaming} {} +/* begin file src/generic/stage1/allocate.h */ +namespace stage1 { +namespace allocate { -really_inline void add_structural() { - *next_structural_index = idx; - next_structural_index++; +// +// Allocates stage 1 internal state and outputs in the parser +// +really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) { + size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; + parser.structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); + if (!parser.structural_indexes) { return MEMALLOC; } + parser.structural_indexes[0] = 0; + parser.n_structural_indexes = 0; + return SUCCESS; +} + +} // namespace allocate +} // namespace stage1 +/* end file src/generic/stage1/allocate.h */ +/* begin file src/generic/stage2/allocate.h */ +namespace stage2 { +namespace allocate { + +// +// Allocates stage 2 internal state and outputs in the parser +// +really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) { + parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]); + parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]); + + if (!parser.ret_address || !parser.containing_scope) { + return MEMALLOC; + } + return SUCCESS; +} + +} // namespace allocate +} // namespace stage2 +/* end file src/generic/stage2/allocate.h */ + +really_inline dom_parser_implementation::dom_parser_implementation() {} + +// Leaving these here so they can be inlined if so desired +WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept { + error_code err = stage1::allocate::set_capacity(*this, capacity); + if (err) { _capacity = 0; return err; } + _capacity = capacity; + return SUCCESS; +} + +WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { + error_code err = stage2::allocate::set_max_depth(*this, max_depth); + if (err) { _max_depth = 0; return err; } + _max_depth = max_depth; + return SUCCESS; +} +/* end file src/generic/stage2/allocate.h */ + +} // namespace fallback +} // namespace simdjson + +#endif // SIMDJSON_FALLBACK_DOM_PARSER_IMPLEMENTATION_H +/* end file src/generic/stage2/allocate.h */ + +TARGET_HASWELL + +namespace simdjson { +namespace fallback { + +WARN_UNUSED error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr& dst +) const noexcept { + dst.reset( new (std::nothrow) dom_parser_implementation() ); + if (!dst) { return MEMALLOC; } + dst->set_capacity(capacity); + dst->set_max_depth(max_depth); + return SUCCESS; +} + +} // namespace fallback +} // namespace simdjson + +UNTARGET_REGION +/* end file src/generic/stage2/allocate.h */ +/* begin file src/fallback/dom_parser_implementation.cpp */ +/* fallback/implementation.h already included: #include "fallback/implementation.h" */ +/* fallback/dom_parser_implementation.h already included: #include "fallback/dom_parser_implementation.h" */ + +// +// Stage 1 +// +namespace simdjson { +namespace fallback { +namespace stage1 { + +/* begin file src/generic/stage1/find_next_document_index.h */ +/** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. + * + * Simply put, we iterate over the structural characters, starting from + * the end. We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ';' ',' + * and when the second element is NOT one of these characters: '}' '}' ';' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ +really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) { + // TODO don't count separately, just figure out depth + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! + if (!arr_cnt && !obj_cnt) { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; + } + return 0; +} + +// Skip the last character if it is partial +really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) { + if (unlikely(len < 3)) { + switch (len) { + case 2: + if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left + return len; + case 1: + if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + return len; + case 0: + return len; + } + } + if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left + if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left + return len; +} +/* end file src/generic/stage1/find_next_document_index.h */ + +class structural_scanner { +public: + +really_inline structural_scanner(dom_parser_implementation &_parser, bool _partial) + : buf{_parser.buf}, + next_structural_index{_parser.structural_indexes.get()}, + parser{_parser}, + len{static_cast(_parser.len)}, + partial{_partial} { +} + +really_inline void add_structural() { + *next_structural_index = idx; + next_structural_index++; } really_inline bool is_continuation(uint8_t c) { @@ -5234,7 +5553,12 @@ really_inline void validate_utf8_character() { // 2-byte if ((buf[idx] & 0b00100000) == 0) { // missing continuation - if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { error = UTF8_ERROR; idx++; return; } + if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { + if (idx+1 > len && partial) { idx = len; return; } + error = UTF8_ERROR; + idx++; + return; + } // overlong: 1100000_ 10______ if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; } idx += 2; @@ -5244,7 +5568,12 @@ really_inline void validate_utf8_character() { // 3-byte if ((buf[idx] & 0b00010000) == 0) { // missing continuation - if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { error = UTF8_ERROR; idx++; return; } + if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { + if (idx+2 > len && partial) { idx = len; return; } + error = UTF8_ERROR; + idx++; + return; + } // overlong: 11100000 100_____ ________ if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; } // surrogates: U+D800-U+DFFF 11101101 101_____ @@ -5255,7 +5584,12 @@ really_inline void validate_utf8_character() { // 4-byte // missing continuation - if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { error = UTF8_ERROR; idx++; return; } + if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { + if (idx+2 > len && partial) { idx = len; return; } + error = UTF8_ERROR; + idx++; + return; + } // overlong: 11110000 1000____ ________ ________ if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; } // too large: > U+10FFFF: @@ -5280,7 +5614,7 @@ really_inline void validate_string() { idx++; } } - if (idx >= len && !streaming) { error = UNCLOSED_STRING; } + if (idx >= len && !partial) { error = UNCLOSED_STRING; } } really_inline bool is_whitespace_or_operator(uint8_t c) { @@ -5321,33 +5655,46 @@ really_inline error_code scan() { break; } } - if (unlikely(next_structural_index == doc_parser.structural_indexes.get())) { + *next_structural_index = len; + // We pad beyond. + // https://github.com/simdjson/simdjson/issues/906 + next_structural_index[1] = len; + next_structural_index[2] = 0; + parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get()); + parser.next_structural_index = 0; + + if (unlikely(parser.n_structural_indexes == 0)) { return EMPTY; } - *next_structural_index = len; - next_structural_index++; - doc_parser.n_structural_indexes = uint32_t(next_structural_index - doc_parser.structural_indexes.get()); + + if (partial) { + auto new_structural_indexes = find_next_document_index(parser); + if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { + return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. + } + parser.n_structural_indexes = new_structural_indexes; + } + return error; } private: const uint8_t *buf; uint32_t *next_structural_index; - parser &doc_parser; - uint32_t idx; + dom_parser_implementation &parser; uint32_t len; - error_code error; - bool streaming; + uint32_t idx{0}; + error_code error{SUCCESS}; + bool partial; }; // structural_scanner } // namespace stage1 -WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept { - if (unlikely(len > parser.capacity())) { - return CAPACITY; - } - stage1::structural_scanner scanner(buf, uint32_t(len), parser, streaming); +WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool partial) noexcept { + this->buf = _buf; + this->len = _len; + stage1::structural_scanner scanner(*this, partial); return scanner.scan(); } @@ -5409,10 +5756,10 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui } // namespace fallback } // namespace simdjson -/* end file src/fallback/stage1.cpp */ -/* begin file src/fallback/stage2.cpp */ -/* fallback/implementation.h already included: #include "fallback/implementation.h" */ +// +// Stage 2 +// /* begin file src/fallback/stringparsing.h */ #ifndef SIMDJSON_FALLBACK_STRINGPARSING_H #define SIMDJSON_FALLBACK_STRINGPARSING_H @@ -5872,10 +6219,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) { // If you consume a large value and you map it to "infinity", you will no // longer be able to serialize back a standard-compliant JSON. And there is // no realistic application where you might need values so large than they - // can't fit in binary64. The maximal value is about 1.7976931348623157 × + // can't fit in binary64. The maximal value is about 1.7976931348623157 x // 10^308 It is an unimaginable large number. There will never be any piece of // engineering involving as many as 10^308 parts. It is estimated that there - // are about 10^80 atoms in the universe.  The estimate for the total number + // are about 10^80 atoms in the universe. The estimate for the total number // of electrons is similar. Using a double-precision floating-point value, we // can represent easily the number of atoms in the universe. We could also // represent the number of ways you can pick any three individual atoms at @@ -5895,26 +6242,6 @@ really_inline bool is_integer(char c) { // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers } -// We need to check that the character following a zero is valid. This is -// probably frequent and it is harder than it looks. We are building all of this -// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)... -const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - -really_inline bool -is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) { - return structural_or_whitespace_or_exponent_or_decimal_negated[c]; -} // check quickly whether the next 8 chars are made of digits // at a glance, it looks better than Mula's @@ -5992,14 +6319,14 @@ never_inline bool parse_large_integer(const uint8_t *const src, // as a positive signed integer, but the negative version is // possible. constexpr int64_t signed_answer = INT64_MIN; - writer.write_s64(signed_answer); + writer.append_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif } else { // we can negate safely int64_t signed_answer = -static_cast(i); - writer.write_s64(signed_answer); + writer.append_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif @@ -6012,12 +6339,12 @@ never_inline bool parse_large_integer(const uint8_t *const src, #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif - writer.write_s64(i); + writer.append_s64(i); } else { #ifdef JSON_TEST_NUMBERS // for unit testing found_unsigned_integer(i, src); #endif - writer.write_u64(i); + writer.append_u64(i); } } return is_structural_or_whitespace(*p); @@ -6027,7 +6354,7 @@ template bool slow_float_parsing(UNUSED const char * src, W writer) { double d; if (parse_float_strtod(src, &d)) { - writer.write_double(d); + writer.append_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, (const uint8_t *)src); #endif @@ -6051,10 +6378,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) { template really_inline bool parse_number(UNUSED const uint8_t *const src, UNUSED bool found_minus, - W writer) { + W &writer) { #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes // useful to skip parsing - writer.write_s64(0); // always write zero + writer.append_s64(0); // always write zero return true; // always succeeds #else const char *p = reinterpret_cast(src); @@ -6074,7 +6401,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, uint64_t i; // an unsigned int avoids signed overflows (which are bad) if (*p == '0') { // 0 cannot be followed by an integer ++p; - if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) { + if (is_integer(*p)) { #ifdef JSON_TEST_NUMBERS // for unit testing found_invalid_number(src); #endif @@ -6198,7 +6525,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, } // we over-decrement by one when there is a '.' digit_count -= int(start - start_digits); - if (digit_count >= 19) { + if (unlikely(digit_count >= 19)) { // Ok, chances are good that we had an overflow! // this is almost never going to get called!!! // we start anew, going slowly!!! @@ -6206,14 +6533,22 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, // 10000000000000000000000000000000000000000000e+308 // 3.1415926535897932384626433832795028841971693993751 // - return slow_float_parsing((const char *) src, writer); + bool success = slow_float_parsing((const char *) src, writer); + // The number was already written, but we made a copy of the writer + // when we passed it to the parse_large_integer() function, so + writer.skip_double(); + return success; } } if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!! // this is almost never going to get called!!! // we start anew, going slowly!!! - return slow_float_parsing((const char *) src, writer); + bool success = slow_float_parsing((const char *) src, writer); + // The number was already written, but we made a copy of the writer when we passed it to the + // slow_float_parsing() function, so we have to skip those tape spots now that we've returned + writer.skip_double(); + return success; } bool success = true; double d = compute_float_64(exponent, i, negative, &success); @@ -6222,7 +6557,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, success = parse_float_strtod((const char *)src, &d); } if (success) { - writer.write_double(d); + writer.append_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, src); #endif @@ -6237,10 +6572,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, if (unlikely(digit_count >= 18)) { // this is uncommon!!! // there is a good chance that we had an overflow, so we need // need to recover: we parse the whole thing again. - return parse_large_integer(src, writer, found_minus); + bool success = parse_large_integer(src, writer, found_minus); + // The number was already written, but we made a copy of the writer + // when we passed it to the parse_large_integer() function, so + writer.skip_large_integer(); + return success; } i = negative ? 0 - i : i; - writer.write_s64(i); + writer.append_s64(i); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif @@ -6263,6 +6602,72 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, namespace simdjson { namespace fallback { +/* begin file src/generic/stage2/logger.h */ +// This is for an internal-only stage 2 specific logger. +// Set LOG_ENABLED = true to log what stage 2 is doing! +namespace logger { + static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; + + static constexpr const bool LOG_ENABLED = false; + static constexpr const int LOG_EVENT_LEN = 30; + static constexpr const int LOG_BUFFER_LEN = 20; + static constexpr const int LOG_DETAIL_LEN = 50; + static constexpr const int LOG_INDEX_LEN = 10; + + static int log_depth; // Not threadsafe. Log only. + + // Helper to turn unprintable or newline characters into spaces + static really_inline char printable_char(char c) { + if (c >= 0x20) { + return c; + } else { + return ' '; + } + } + + // Print the header and set up log_start + static really_inline void log_start() { + if (LOG_ENABLED) { + log_depth = 0; + printf("\n"); + printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index"); + printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES); + } + } + + static really_inline void log_string(const char *message) { + if (LOG_ENABLED) { + printf("%s\n", message); + } + } + + // Logs a single line of + template + static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { + if (LOG_ENABLED) { + printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); + { + // Print the next N characters in the buffer. + printf("| "); + // Otherwise, print the characters starting from the buffer position. + // Print spaces for unprintable or newline characters. + for (int i=0;i really_inline bool with_space_terminated_copy(const F& f) { @@ -6357,32 +6770,25 @@ class structural_iterator { * practice unless you are in the strange scenario where you have many JSON * documents made of single atoms. */ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + char *copy = static_cast(malloc(parser.len + SIMDJSON_PADDING)); if (copy == nullptr) { return true; } - memcpy(copy, buf, len); - memset(copy + len, ' ', SIMDJSON_PADDING); - bool result = f(reinterpret_cast(copy), idx); + memcpy(copy, buf, parser.len); + memset(copy + parser.len, ' ', SIMDJSON_PADDING); + bool result = f(reinterpret_cast(copy), *current_structural); free(copy); return result; } really_inline bool past_end(uint32_t n_structural_indexes) { - return next_structural+1 > n_structural_indexes; + return current_structural >= &parser.structural_indexes[n_structural_indexes]; } really_inline bool at_end(uint32_t n_structural_indexes) { - return next_structural+1 == n_structural_indexes; + return current_structural == &parser.structural_indexes[n_structural_indexes]; } - really_inline size_t next_structural_index() { - return next_structural; + really_inline bool at_beginning() { + return current_structural == parser.structural_indexes.get(); } - - const uint8_t* const buf; - const size_t len; - const uint32_t* const structural_indexes; - size_t next_structural; // next structural index - size_t idx{0}; // location of the structural character in the input (buf) - uint8_t c{0}; // used to track the (structural) character we are looking at }; } // namespace stage2 @@ -6394,8 +6800,105 @@ class structural_iterator { // "simdjson/stage2.h" (this simplifies amalgation) namespace stage2 { +namespace { // Make everything here private + +/* begin file src/generic/stage2/tape_writer.h */ +struct tape_writer { + /** The next place to write to tape */ + uint64_t *next_tape_loc; + + /** Write a signed 64-bit value to tape. */ + really_inline void append_s64(int64_t value) noexcept; + + /** Write an unsigned 64-bit value to tape. */ + really_inline void append_u64(uint64_t value) noexcept; + + /** Write a double value to tape. */ + really_inline void append_double(double value) noexcept; + + /** + * Append a tape entry (an 8-bit type,and 56 bits worth of value). + */ + really_inline void append(uint64_t val, internal::tape_type t) noexcept; + + /** + * Skip the current tape entry without writing. + * + * Used to skip the start of the container, since we'll come back later to fill it in when the + * container ends. + */ + really_inline void skip() noexcept; + + /** + * Skip the number of tape entries necessary to write a large u64 or i64. + */ + really_inline void skip_large_integer() noexcept; + + /** + * Skip the number of tape entries necessary to write a double. + */ + really_inline void skip_double() noexcept; + + /** + * Write a value to a known location on tape. + * + * Used to go back and write out the start of a container after the container ends. + */ + really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; + +private: + /** + * Append both the tape entry, and a supplementary value following it. Used for types that need + * all 64 bits, such as double and uint64_t. + */ + template + really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; +}; // struct number_writer + +really_inline void tape_writer::append_s64(int64_t value) noexcept { + append2(0, value, internal::tape_type::INT64); +} + +really_inline void tape_writer::append_u64(uint64_t value) noexcept { + append(0, internal::tape_type::UINT64); + *next_tape_loc = value; + next_tape_loc++; +} + +/** Write a double value to tape. */ +really_inline void tape_writer::append_double(double value) noexcept { + append2(0, value, internal::tape_type::DOUBLE); +} + +really_inline void tape_writer::skip() noexcept { + next_tape_loc++; +} + +really_inline void tape_writer::skip_large_integer() noexcept { + next_tape_loc += 2; +} + +really_inline void tape_writer::skip_double() noexcept { + next_tape_loc += 2; +} + +really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept { + *next_tape_loc = val | ((uint64_t(char(t))) << 56); + next_tape_loc++; +} + +template +really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept { + append(val, t); + static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); + memcpy(next_tape_loc, &val2, sizeof(val2)); + next_tape_loc++; +} -using internal::ret_address; +really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept { + tape_loc = val | ((uint64_t(char(t))) << 56); +} +/* end file src/generic/stage2/tape_writer.h */ #ifdef SIMDJSON_USE_COMPUTED_GOTO #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue } @@ -6426,102 +6929,88 @@ using internal::ret_address; #endif // SIMDJSON_USE_COMPUTED_GOTO struct unified_machine_addresses { - ret_address array_begin; - ret_address array_continue; - ret_address error; - ret_address finish; - ret_address object_begin; - ret_address object_continue; + ret_address_t array_begin; + ret_address_t array_continue; + ret_address_t error; + ret_address_t finish; + ret_address_t object_begin; + ret_address_t object_continue; }; #undef FAIL_IF #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } } -struct number_writer { - parser &doc_parser; - - really_inline void write_s64(int64_t value) noexcept { - write_tape(0, internal::tape_type::INT64); - std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value)); - ++doc_parser.current_loc; - } - really_inline void write_u64(uint64_t value) noexcept { - write_tape(0, internal::tape_type::UINT64); - doc_parser.doc.tape[doc_parser.current_loc++] = value; - } - really_inline void write_double(double value) noexcept { - write_tape(0, internal::tape_type::DOUBLE); - static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size"); - memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double)); - // doc.tape[doc.current_loc++] = *((uint64_t *)&d); - } - really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { - doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); - } -}; // struct number_writer - -struct structural_parser { - structural_iterator structurals; - parser &doc_parser; +struct structural_parser : structural_iterator { + /** Lets you append to the tape */ + tape_writer tape; /** Next write location in the string buf for stage 2 parsing */ - uint8_t *current_string_buf_loc{}; - uint32_t depth; - - really_inline structural_parser( - const uint8_t *buf, - size_t len, - parser &_doc_parser, - uint32_t next_structural = 0 - ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {} - - WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) { - doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc; - doc_parser.containing_scope[depth].count = 0; - write_tape(0, type); // if the document is correct, this gets rewritten later - doc_parser.ret_address[depth] = continue_state; + uint8_t *current_string_buf_loc; + /** Current depth (nested objects and arrays) */ + uint32_t depth{0}; + + // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations + really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index) + : structural_iterator(_parser, start_structural_index), + tape{parser.doc->tape.get()}, + current_string_buf_loc{parser.doc->string_buf.get()} { + } + + WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) { + parser.containing_scope[depth].tape_index = next_tape_index(); + parser.containing_scope[depth].count = 0; + tape.skip(); // We don't actually *write* the start element until the end. + parser.ret_address[depth] = continue_state; depth++; - return depth >= doc_parser.max_depth(); + bool exceeded_max_depth = depth >= parser.max_depth(); + if (exceeded_max_depth) { log_error("Exceeded max depth!"); } + return exceeded_max_depth; } - WARN_UNUSED really_inline bool start_document(ret_address continue_state) { - return start_scope(internal::tape_type::ROOT, continue_state); + WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) { + log_start_value("document"); + return start_scope(continue_state); } - WARN_UNUSED really_inline bool start_object(ret_address continue_state) { - return start_scope(internal::tape_type::START_OBJECT, continue_state); + WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) { + log_start_value("object"); + return start_scope(continue_state); } - WARN_UNUSED really_inline bool start_array(ret_address continue_state) { - return start_scope(internal::tape_type::START_ARRAY, continue_state); + WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) { + log_start_value("array"); + return start_scope(continue_state); } // this function is responsible for annotating the start of the scope - really_inline void end_scope(internal::tape_type type) noexcept { + really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept { depth--; - // write our doc.tape location to the header scope + // write our doc->tape location to the header scope // The root scope gets written *at* the previous location. - write_tape(doc_parser.containing_scope[depth].tape_index, type); + tape.append(parser.containing_scope[depth].tape_index, end); // count can overflow if it exceeds 24 bits... so we saturate // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). - const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index; - const uint32_t count = doc_parser.containing_scope[depth].count; + const uint32_t start_tape_index = parser.containing_scope[depth].tape_index; + const uint32_t count = parser.containing_scope[depth].count; const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; - // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index] - doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32); + // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index] + tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start); + } + + really_inline uint32_t next_tape_index() { + return uint32_t(tape.next_tape_loc - parser.doc->tape.get()); } really_inline void end_object() { - end_scope(internal::tape_type::END_OBJECT); + log_end_value("object"); + end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); } really_inline void end_array() { - end_scope(internal::tape_type::END_ARRAY); + log_end_value("array"); + end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); } really_inline void end_document() { - end_scope(internal::tape_type::ROOT); - } - - really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { - doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); + log_end_value("document"); + end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT); } // increment_count increments the count of keys in an object or values in an array. @@ -6529,17 +7018,16 @@ struct structural_parser { // must be increment in the preceding depth (depth-1) where the array or // the object resides. really_inline void increment_count() { - doc_parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 + parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 } really_inline uint8_t *on_start_string() noexcept { - /* we advance the point, accounting for the fact that we have a NULL - * termination */ - write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING); + // we advance the point, accounting for the fact that we have a NULL termination + tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING); return current_string_buf_loc + sizeof(uint32_t); } - really_inline bool on_end_string(uint8_t *dst) noexcept { + really_inline void on_end_string(uint8_t *dst) noexcept { uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); // TODO check for overflow in case someone has a crazy string (>=4GB?) // But only add the overflow check when the document itself exceeds 4GB @@ -6549,73 +7037,49 @@ struct structural_parser { // be NULL terminated? It comes at a small cost *dst = 0; current_string_buf_loc = dst + 1; - return true; } - WARN_UNUSED really_inline bool parse_string() { + WARN_UNUSED really_inline bool parse_string(bool key = false) { + log_value(key ? "key" : "string"); uint8_t *dst = on_start_string(); - dst = stringparsing::parse_string(structurals.current(), dst); + dst = stringparsing::parse_string(current(), dst); if (dst == nullptr) { + log_error("Invalid escape in string"); return true; } - return !on_end_string(dst); + on_end_string(dst); + return false; } WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) { - number_writer writer{doc_parser}; - return !numberparsing::parse_number(src, found_minus, writer); + log_value("number"); + bool succeeded = numberparsing::parse_number(src, found_minus, tape); + if (!succeeded) { log_error("Invalid number"); } + return !succeeded; } WARN_UNUSED really_inline bool parse_number(bool found_minus) { - return parse_number(structurals.current(), found_minus); - } - - WARN_UNUSED really_inline bool parse_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; } - write_tape(0, internal::tape_type::TRUE_VALUE); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; } - write_tape(0, internal::tape_type::FALSE_VALUE); - break; - case 'n': - if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; } - write_tape(0, internal::tape_type::NULL_VALUE); - break; - default: - return true; - } - return false; - } - - WARN_UNUSED really_inline bool parse_single_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; } - write_tape(0, internal::tape_type::TRUE_VALUE); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; } - write_tape(0, internal::tape_type::FALSE_VALUE); - break; - case 'n': - if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; } - write_tape(0, internal::tape_type::NULL_VALUE); - break; - default: - return true; - } - return false; + return parse_number(current(), found_minus); } - WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) { - switch (structurals.current_char()) { + WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) { + switch (advance_char()) { case '"': FAIL_IF( parse_string() ); return continue_state; - case 't': case 'f': case 'n': - FAIL_IF( parse_atom() ); + case 't': + log_value("true"); + FAIL_IF( !atomparsing::is_valid_true_atom(current()) ); + tape.append(0, internal::tape_type::TRUE_VALUE); + return continue_state; + case 'f': + log_value("false"); + FAIL_IF( !atomparsing::is_valid_false_atom(current()) ); + tape.append(0, internal::tape_type::FALSE_VALUE); + return continue_state; + case 'n': + log_value("null"); + FAIL_IF( !atomparsing::is_valid_null_atom(current()) ); + tape.append(0, internal::tape_type::NULL_VALUE); return continue_state; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': @@ -6631,40 +7095,27 @@ struct structural_parser { FAIL_IF( start_array(continue_state) ); return addresses.array_begin; default: + log_error("Non-value found when value was expected!"); return addresses.error; } } WARN_UNUSED really_inline error_code finish() { - // the string might not be NULL terminated. - if ( !structurals.at_end(doc_parser.n_structural_indexes) ) { - return on_error(TAPE_ERROR); - } end_document(); + parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]); + if (depth != 0) { - return on_error(TAPE_ERROR); + log_error("Unclosed objects or arrays!"); + return parser.error = TAPE_ERROR; } - if (doc_parser.containing_scope[depth].tape_index != 0) { - return on_error(TAPE_ERROR); - } - - return on_success(SUCCESS); - } - really_inline error_code on_error(error_code new_error_code) noexcept { - doc_parser.error = new_error_code; - return new_error_code; - } - really_inline error_code on_success(error_code success_code) noexcept { - doc_parser.error = success_code; - doc_parser.valid = true; - return success_code; + return SUCCESS; } WARN_UNUSED really_inline error_code error() { - /* We do not need the next line because this is done by doc_parser.init_stage2(), + /* We do not need the next line because this is done by parser.init_stage2(), * pessimistically. - * doc_parser.is_valid = false; + * parser.is_valid = false; * At this point in the code, we have all the time in the world. * Note that we know exactly where we are in the document so we could, * without any overhead on the processing code, report a specific @@ -6672,12 +7123,12 @@ struct structural_parser { * We could even trigger special code paths to assess what happened * carefully, * all without any added cost. */ - if (depth >= doc_parser.max_depth()) { - return on_error(DEPTH_ERROR); + if (depth >= parser.max_depth()) { + return parser.error = DEPTH_ERROR; } - switch (structurals.current_char()) { + switch (current_char()) { case '"': - return on_error(STRING_ERROR); + return parser.error = STRING_ERROR; case '0': case '1': case '2': @@ -6689,92 +7140,124 @@ struct structural_parser { case '8': case '9': case '-': - return on_error(NUMBER_ERROR); + return parser.error = NUMBER_ERROR; case 't': - return on_error(T_ATOM_ERROR); + return parser.error = T_ATOM_ERROR; case 'n': - return on_error(N_ATOM_ERROR); + return parser.error = N_ATOM_ERROR; case 'f': - return on_error(F_ATOM_ERROR); + return parser.error = F_ATOM_ERROR; default: - return on_error(TAPE_ERROR); + return parser.error = TAPE_ERROR; } } really_inline void init() { - current_string_buf_loc = doc_parser.doc.string_buf.get(); - doc_parser.current_loc = 0; - doc_parser.valid = false; - doc_parser.error = UNINITIALIZED; + log_start(); + parser.error = UNINITIALIZED; } - WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) { - init(); // sets is_valid to false - if (len > doc_parser.capacity()) { - return CAPACITY; + WARN_UNUSED really_inline error_code start(ret_address_t finish_state) { + // If there are no structurals left, return EMPTY + if (at_end(parser.n_structural_indexes)) { + return parser.error = EMPTY; } - // Advance to the first character as soon as possible - structurals.advance_char(); + + init(); // Push the root scope (there is always at least one scope) if (start_document(finish_state)) { - return on_error(DEPTH_ERROR); + return parser.error = DEPTH_ERROR; } return SUCCESS; } - really_inline char advance_char() { - return structurals.advance_char(); + really_inline void log_value(const char *type) { + logger::log_line(*this, "", type, ""); + } + + static really_inline void log_start() { + logger::log_start(); + } + + really_inline void log_start_value(const char *type) { + logger::log_line(*this, "+", type, ""); + if (logger::LOG_ENABLED) { logger::log_depth++; } + } + + really_inline void log_end_value(const char *type) { + if (logger::LOG_ENABLED) { logger::log_depth--; } + logger::log_line(*this, "-", type, ""); } -}; + + really_inline void log_error(const char *error) { + logger::log_line(*this, "", "ERROR", error); + } +}; // struct structural_parser // Redefine FAIL_IF to use goto since it'll be used inside the function now #undef FAIL_IF #define FAIL_IF(EXPR) { if (EXPR) { goto error; } } -} // namespace stage2 - -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. - ***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { +template +WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept { + dom_parser.doc = &doc; static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::structural_parser parser(buf, len, doc_parser); - error_code result = parser.start(len, addresses.finish); + stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0); + error_code result = parser.start(addresses.finish); if (result) { return result; } // // Read first value // - switch (parser.structurals.current_char()) { + switch (parser.current_char()) { case '{': FAIL_IF( parser.start_object(addresses.finish) ); goto object_begin; case '[': FAIL_IF( parser.start_array(addresses.finish) ); + // Make sure the outer array is closed before continuing; otherwise, there are ways we could get + // into memory corruption. See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) { + if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') { + goto error; + } + } goto array_begin; case '"': FAIL_IF( parser.parse_string() ); goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); + case 't': + parser.log_value("true"); + FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) ); + parser.tape.append(0, internal::tape_type::TRUE_VALUE); + goto finish; + case 'f': + parser.log_value("false"); + FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) ); + parser.tape.append(0, internal::tape_type::FALSE_VALUE); + goto finish; + case 'n': + parser.log_value("null"); + FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) ); + parser.tape.append(0, internal::tape_type::NULL_VALUE); goto finish; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], false); }) ); goto finish; case '-': FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], true); }) ); goto finish; default: + parser.log_error("Document starts with a non-value character"); goto error; } @@ -6785,43 +7268,45 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa switch (parser.advance_char()) { case '"': { parser.increment_count(); - FAIL_IF( parser.parse_string() ); + FAIL_IF( parser.parse_string(true) ); goto object_key_state; } case '}': parser.end_object(); goto scope_end; default: + parser.log_error("Object does not start with a key"); goto error; } object_key_state: - FAIL_IF( parser.advance_char() != ':' ); - parser.advance_char(); + if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; } GOTO( parser.parse_value(addresses, addresses.object_continue) ); object_continue: switch (parser.advance_char()) { case ',': parser.increment_count(); - FAIL_IF( parser.advance_char() != '"' ); - FAIL_IF( parser.parse_string() ); + if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; } + FAIL_IF( parser.parse_string(true) ); goto object_key_state; case '}': parser.end_object(); goto scope_end; default: + parser.log_error("No comma between object fields"); goto error; } scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); + CONTINUE( parser.parser.ret_address[parser.depth] ); // // Array parser states // array_begin: - if (parser.advance_char() == ']') { + if (parser.peek_next_char() == ']') { + parser.advance_char(); parser.end_array(); goto scope_end; } @@ -6836,12 +7321,12 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa switch (parser.advance_char()) { case ',': parser.increment_count(); - parser.advance_char(); goto main_array_switch; case ']': parser.end_array(); goto scope_end; default: + parser.log_error("Missing comma between array values"); goto error; } @@ -6852,178 +7337,191 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa return parser.error(); } -WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { - error_code code = stage1(buf, len, doc_parser, false); - if (!code) { - code = stage2(buf, len, doc_parser); - } - return code; -} -/* end file src/generic/stage2/structural_parser.h */ -/* begin file src/generic/stage2/streaming_structural_parser.h */ -namespace stage2 { - -struct streaming_structural_parser: structural_parser { - really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {} +} // namespace {} +} // namespace stage2 - // override to add streaming - WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) { - init(); // sets is_valid to false - // Capacity ain't no thang for streaming, so we don't check it. - // Advance to the first character as soon as possible - advance_char(); - // Push the root scope (there is always at least one scope) - if (start_document(finish_parser)) { - return on_error(DEPTH_ERROR); - } - return SUCCESS; - } +/************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. + ***********/ +WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { + error_code result = stage2::parse_structurals(*this, _doc); + if (result) { return result; } - // override to add streaming - WARN_UNUSED really_inline error_code finish() { - if ( structurals.past_end(doc_parser.n_structural_indexes) ) { - return on_error(TAPE_ERROR); - } - end_document(); - if (depth != 0) { - return on_error(TAPE_ERROR); - } - if (doc_parser.containing_scope[depth].tape_index != 0) { - return on_error(TAPE_ERROR); - } - bool finished = structurals.at_end(doc_parser.n_structural_indexes); - return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE); + // If we didn't make it to the end, it's an error + if ( next_structural_index != n_structural_indexes ) { + logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return error = TAPE_ERROR; } -}; -} // namespace stage2 + return SUCCESS; +} /************ * The JSON is parsed to a tape, see the accompanying tape.md file * for documentation. ***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept { - static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json)); - error_code result = parser.start(len, addresses.finish); - if (result) { return result; } - // - // Read first value - // - switch (parser.structurals.current_char()) { - case '{': - FAIL_IF( parser.start_object(addresses.finish) ); - goto object_begin; - case '[': - FAIL_IF( parser.start_array(addresses.finish) ); - goto array_begin; - case '"': - FAIL_IF( parser.parse_string() ); - goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); - goto finish; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { - return parser.parse_number(©[idx], false); - }) - ); - goto finish; - case '-': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { - return parser.parse_number(©[idx], true); - }) - ); - goto finish; - default: - goto error; - } +WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + return stage2::parse_structurals(*this, _doc); +} +/* end file src/generic/stage2/tape_writer.h */ -// -// Object parser parsers -// -object_begin: - switch (parser.advance_char()) { - case '"': { - FAIL_IF( parser.parse_string() ); - goto object_key_parser; - } - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } +WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { + error_code err = stage1(_buf, _len, false); + if (err) { return err; } + return stage2(_doc); +} -object_key_parser: - FAIL_IF( parser.advance_char() != ':' ); - parser.increment_count(); - parser.advance_char(); - GOTO( parser.parse_value(addresses, addresses.object_continue) ); +} // namespace fallback +} // namespace simdjson +/* end file src/generic/stage2/tape_writer.h */ +#endif +#if SIMDJSON_IMPLEMENTATION_HASWELL +/* begin file src/haswell/implementation.cpp */ +/* haswell/implementation.h already included: #include "haswell/implementation.h" */ +/* begin file src/haswell/dom_parser_implementation.h */ +#ifndef SIMDJSON_HASWELL_DOM_PARSER_IMPLEMENTATION_H +#define SIMDJSON_HASWELL_DOM_PARSER_IMPLEMENTATION_H -object_continue: - switch (parser.advance_char()) { - case ',': - FAIL_IF( parser.advance_char() != '"' ); - FAIL_IF( parser.parse_string() ); - goto object_key_parser; - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } +/* isadetection.h already included: #include "isadetection.h" */ -scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); +namespace simdjson { +namespace haswell { + +/* begin file src/generic/dom_parser_implementation.h */ +// expectation: sizeof(scope_descriptor) = 64/8. +struct scope_descriptor { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope +}; // struct scope_descriptor + +#ifdef SIMDJSON_USE_COMPUTED_GOTO +typedef void* ret_address_t; +#else +typedef char ret_address_t; +#endif + +class dom_parser_implementation final : public internal::dom_parser_implementation { +public: + /** Tape location of each open { or [ */ + std::unique_ptr containing_scope{}; + /** Return address of each open { or [ */ + std::unique_ptr ret_address{}; + /** Buffer passed to stage 1 */ + const uint8_t *buf{}; + /** Length passed to stage 1 */ + size_t len{0}; + /** Document passed to stage 2 */ + dom::document *doc{}; + /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */ + error_code error{UNINITIALIZED}; + + really_inline dom_parser_implementation(); + dom_parser_implementation(const dom_parser_implementation &) = delete; + dom_parser_implementation & operator=(const dom_parser_implementation &) = delete; + + WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; + WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; + WARN_UNUSED error_code check_for_unclosed_array() noexcept; + WARN_UNUSED error_code stage2(dom::document &doc) noexcept final; + WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final; + WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final; + WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final; +}; + +/* begin file src/generic/stage1/allocate.h */ +namespace stage1 { +namespace allocate { // -// Array parser parsers +// Allocates stage 1 internal state and outputs in the parser // -array_begin: - if (parser.advance_char() == ']') { - parser.end_array(); - goto scope_end; - } - parser.increment_count(); +really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) { + size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; + parser.structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); + if (!parser.structural_indexes) { return MEMALLOC; } + parser.structural_indexes[0] = 0; + parser.n_structural_indexes = 0; + return SUCCESS; +} -main_array_switch: - /* we call update char on all paths in, so we can peek at parser.c on the - * on paths that can accept a close square brace (post-, and at start) */ - GOTO( parser.parse_value(addresses, addresses.array_continue) ); +} // namespace allocate +} // namespace stage1 +/* end file src/generic/stage1/allocate.h */ +/* begin file src/generic/stage2/allocate.h */ +namespace stage2 { +namespace allocate { -array_continue: - switch (parser.advance_char()) { - case ',': - parser.increment_count(); - parser.advance_char(); - goto main_array_switch; - case ']': - parser.end_array(); - goto scope_end; - default: - goto error; +// +// Allocates stage 2 internal state and outputs in the parser +// +really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) { + parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]); + parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]); + + if (!parser.ret_address || !parser.containing_scope) { + return MEMALLOC; } + return SUCCESS; +} -finish: - next_json = parser.structurals.next_structural_index(); - return parser.finish(); +} // namespace allocate +} // namespace stage2 +/* end file src/generic/stage2/allocate.h */ -error: - return parser.error(); +really_inline dom_parser_implementation::dom_parser_implementation() {} + +// Leaving these here so they can be inlined if so desired +WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept { + error_code err = stage1::allocate::set_capacity(*this, capacity); + if (err) { _capacity = 0; return err; } + _capacity = capacity; + return SUCCESS; } -/* end file src/generic/stage2/streaming_structural_parser.h */ -} // namespace fallback +WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { + error_code err = stage2::allocate::set_max_depth(*this, max_depth); + if (err) { _max_depth = 0; return err; } + _max_depth = max_depth; + return SUCCESS; +} +/* end file src/generic/stage2/allocate.h */ + +} // namespace haswell } // namespace simdjson -/* end file src/generic/stage2/streaming_structural_parser.h */ -#endif -#if SIMDJSON_IMPLEMENTATION_HASWELL -/* begin file src/haswell/stage1.cpp */ +#endif // SIMDJSON_HASWELL_DOM_PARSER_IMPLEMENTATION_H +/* end file src/generic/stage2/allocate.h */ + +TARGET_HASWELL + +namespace simdjson { +namespace haswell { + +WARN_UNUSED error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr& dst +) const noexcept { + dst.reset( new (std::nothrow) dom_parser_implementation() ); + if (!dst) { return MEMALLOC; } + dst->set_capacity(capacity); + dst->set_max_depth(max_depth); + return SUCCESS; +} + +} // namespace haswell +} // namespace simdjson + +UNTARGET_REGION +/* end file src/generic/stage2/allocate.h */ +/* begin file src/haswell/dom_parser_implementation.cpp */ +/* haswell/implementation.h already included: #include "haswell/implementation.h" */ +/* haswell/dom_parser_implementation.h already included: #include "haswell/dom_parser_implementation.h" */ + +// +// Stage 1 +// /* begin file src/haswell/bitmask.h */ #ifndef SIMDJSON_HASWELL_BITMASK_H #define SIMDJSON_HASWELL_BITMASK_H @@ -7568,7 +8066,6 @@ UNTARGET_REGION #endif // SIMDJSON_HASWELL_SIMD_H /* end file src/haswell/bitmanipulation.h */ /* haswell/bitmanipulation.h already included: #include "haswell/bitmanipulation.h" */ -/* haswell/implementation.h already included: #include "haswell/implementation.h" */ TARGET_HASWELL namespace simdjson { @@ -7627,24 +8124,21 @@ really_inline simd8 must_be_continuation(simd8 prev1, simd8 struct buf_block_reader { public: - really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} - really_inline size_t block_index() { return idx; } - really_inline bool has_full_block() const { - return idx < lenminusstep; - } - really_inline const uint8_t *full_block() const { - return &buf[idx]; - } - really_inline bool has_remainder() const { - return idx < len; - } - really_inline void get_remainder(uint8_t *tmp_buf) const { - memset(tmp_buf, 0x20, STEP_SIZE); - memcpy(tmp_buf, buf + idx, len - idx); - } - really_inline void advance() { - idx += STEP_SIZE; - } + really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + really_inline size_t block_index(); + really_inline bool has_full_block() const; + really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + really_inline size_t get_remainder(uint8_t *dst) const; + really_inline void advance(); private: const uint8_t *buf; const size_t len; @@ -7652,6 +8146,18 @@ struct buf_block_reader { size_t idx; }; +constexpr const int TITLE_SIZE = 12; + +// Routines to print masks and text for debugging bitmask operations +UNUSED static char * format_input_text_64(const uint8_t *text) { + static char *buf = (char*)malloc(sizeof(simd8x64) + 1); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + // Routines to print masks and text for debugging bitmask operations UNUSED static char * format_input_text(const simd8x64 in) { static char *buf = (char*)malloc(sizeof(simd8x64) + 1); @@ -7671,6 +8177,34 @@ UNUSED static char * format_mask(uint64_t mask) { buf[64] = '\0'; return buf; } + +template +really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +really_inline size_t buf_block_reader::block_index() { return idx; } + +template +really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} /* end file src/generic/stage1/buf_block_reader.h */ /* begin file src/generic/stage1/json_string_scanner.h */ namespace stage1 { @@ -7970,13 +8504,15 @@ template error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept { buf_block_reader reader(buf, len); json_minifier minifier(dst); + + // Index the first n-1 blocks while (reader.has_full_block()) { minifier.step(reader.full_block(), reader); } - if (likely(reader.has_remainder())) { - uint8_t block[STEP_SIZE]; - reader.get_remainder(block); + // Index the last (remainder) block, padded with spaces + uint8_t block[STEP_SIZE]; + if (likely(reader.get_remainder(block)) > 0) { minifier.step(block, reader); } @@ -7989,6 +8525,94 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len); } +/* begin file src/generic/stage1/find_next_document_index.h */ +/** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. + * + * Simply put, we iterate over the structural characters, starting from + * the end. We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ';' ',' + * and when the second element is NOT one of these characters: '}' '}' ';' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ +really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) { + // TODO don't count separately, just figure out depth + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! + if (!arr_cnt && !obj_cnt) { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; + } + return 0; +} + +// Skip the last character if it is partial +really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) { + if (unlikely(len < 3)) { + switch (len) { + case 2: + if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left + return len; + case 1: + if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + return len; + case 0: + return len; + } + } + if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left + if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left + return len; +} +/* end file src/generic/stage1/find_next_document_index.h */ /* begin file src/generic/stage1/utf8_lookup2_algorithm.h */ // // Detect Unicode errors. @@ -8039,9 +8663,9 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui // support values with more than 23 bits (which a 4-byte character supports). // // e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000) -// +// // Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: -// +// // Code Points 1st 2s 3s 4s // U+0000..U+007F 00..7F // U+0080..U+07FF C2..DF 80..BF @@ -8056,6 +8680,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui using namespace simd; namespace utf8_validation { + // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)". // // Find special case UTF-8 errors where the character is technically readable (has the right length) @@ -8100,7 +8725,7 @@ namespace utf8_validation { const simd8 byte_1_high = prev1.shr<4>().lookup_16( // [0___]____ (ASCII) - 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, // [10__]____ (continuation) 0, 0, 0, 0, @@ -8131,214 +8756,6 @@ namespace utf8_validation { return byte_1_high & byte_1_low & byte_2_high; } - // - // Validate the length of multibyte characters (that each multibyte character has the right number - // of continuation characters, and that all continuation characters are part of a multibyte - // character). - // - // Algorithm - // ========= - // - // This algorithm compares *expected* continuation characters with *actual* continuation bytes, - // and emits an error anytime there is a mismatch. - // - // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte - // characters, the file will look like this: - // - // | Character | 𝄞 | | | | ₿ | | | ֏ | | a | b | - // |-----------------------|----|----|----|----|----|----|----|----|----|----|----| - // | Character Length | 4 | | | | 3 | | | 2 | | 1 | 1 | - // | Byte | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 | - // | is_second_byte | | X | | | | X | | | X | | | - // | is_third_byte | | | X | | | | X | | | | | - // | is_fourth_byte | | | | X | | | | | | | | - // | expected_continuation | | X | X | X | | X | X | | X | | | - // | is_continuation | | X | X | X | | X | X | | X | | | - // - // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation): - // - // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not - // part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just - // floating around extra outside of any character, or that there is an illegal 5-byte character, - // or maybe it's at the beginning of the file before any characters have started; but it's an - // error in all these cases. - // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means - // we started a new character before we were finished with the current one. - // - // Getting the Previous Bytes - // -------------------------- - // - // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte - // character, we need to "shift the bytes" to find that out. This is what they mean: - // - // - `is_continuation`: if the current byte is a continuation. - // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character. - // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character. - // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character. - // - // We use shuffles to go n bytes back, selecting part of the current `input` and part of the - // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller - // function, because the 1-byte-back data is used by other checks as well. - // - // Getting the Continuation Mask - // ----------------------------- - // - // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as - // numbers, using signed `<` and `>` operations to check if they are continuations or leads. - // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because - // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones). - // - // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads," - // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them. - // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0. - // - // When treated as signed numbers, they look like this: - // - // | Type | High Bits | Binary Range | Signed | - // |--------------|------------|--------------|--------| - // | ASCII | `0` | `01111111` | 127 | - // | | | `00000000` | 0 | - // | 4+-Byte Lead | `1111` | `11111111` | -1 | - // | | | `11110000 | -16 | - // | 3-Byte Lead | `1110` | `11101111` | -17 | - // | | | `11100000 | -32 | - // | 2-Byte Lead | `110` | `11011111` | -33 | - // | | | `11000000 | -64 | - // | Continuation | `10` | `10111111` | -65 | - // | | | `10000000 | -128 | - // - // This makes it pretty easy to get the continuation mask! It's just a single comparison: - // - // ``` - // is_continuation = input < -64` - // ``` - // - // We can do something similar for the others, but it takes two comparisons instead of one: "is - // the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and - // `> -64`. Surely we can do better, they're right next to each other! - // - // Getting the is_xxx Masks: Shifting the Range - // -------------------------------------------- - // - // Notice *why* continuations were a single comparison. The actual *range* would require two - // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get - // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be - // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`. - // - // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps - // ASCII down into the negative, and puts 4+-Byte Lead at the top: - // - // | Type | High Bits | Binary Range | Signed | - // |----------------------|------------|--------------|-------| - // | 4+-Byte Lead (+ 127) | `0111` | `01111111` | 127 | - // | | | `01110000 | 112 | - // |----------------------|------------|--------------|-------| - // | 3-Byte Lead (+ 127) | `0110` | `01101111` | 111 | - // | | | `01100000 | 96 | - // |----------------------|------------|--------------|-------| - // | 2-Byte Lead (+ 127) | `010` | `01011111` | 95 | - // | | | `01000000 | 64 | - // |----------------------|------------|--------------|-------| - // | Continuation (+ 127) | `00` | `00111111` | 63 | - // | | | `00000000 | 0 | - // |----------------------|------------|--------------|-------| - // | ASCII (+ 127) | `1` | `11111111` | -1 | - // | | | `10000000` | -128 | - // |----------------------|------------|--------------|-------| - // - // *Now* we can use signed `>` on all of them: - // - // ``` - // prev1 = input.prev<1> - // prev2 = input.prev<2> - // prev3 = input.prev<3> - // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128` - // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128` - // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128` - // is_second_byte = prev1_flipped > 63; // 2+-byte lead - // is_third_byte = prev2_flipped > 95; // 3+-byte lead - // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead - // ``` - // - // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number - // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3 - // `^`'s at a time on Haswell, but only 2 `+`'s). - // - // That doesn't look like it saved us any instructions, did it? Well, because we're adding the - // same number to all of them, we can save one of those `+ 128` operations by assembling - // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128 - // to it. One more instruction saved! - // - // ``` - // prev1 = input.prev<1> - // prev3 = input.prev<3> - // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128` - // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128` - // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped): // | C -> ^ D, or - // you do B | C -> | A -> ^ D. But addition and subtraction *are* associative: (A + B + C) - D can - // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and - // then adds the result together. Same number of operations, but if the processor can run - // independent things in parallel (which most can), it runs faster. - // - // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have - // a super nice advantage in that more of them can be run at the same time (they can run on 3 - // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C, - // saving us the cycle we would have earned by using +. Even more, using an instruction with a - // wider array of ports can help *other* code run ahead, too, since these instructions can "get - // out of the way," running on a port other instructions can't. - // - // Epilogue II: One More Trick - // --------------------------- - // - // There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay - // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in - // check_special_cases()--but we'll talk about that there :) - // really_inline simd8 check_multibyte_lengths(simd8 input, simd8 prev_input, simd8 prev1) { simd8 prev2 = input.prev<2>(prev_input); simd8 prev3 = input.prev<3>(prev_input); @@ -8476,16 +8893,22 @@ class bit_indexer { class json_structural_indexer { public: + /** + * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. + * + * @param partial Setting the partial parameter to true allows the find_structural_bits to + * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If + * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. + */ template - static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept; + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept; private: - really_inline json_structural_indexer(uint32_t *structural_indexes) - : indexer{structural_indexes} {} + really_inline json_structural_indexer(uint32_t *structural_indexes); template really_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; really_inline void next(simd::simd8x64 in, json_block block, size_t idx); - really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming); + really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial); json_scanner scanner{}; utf8_checker checker{}; @@ -8494,65 +8917,8 @@ class json_structural_indexer { uint64_t unescaped_chars_error = 0; }; -really_inline void json_structural_indexer::next(simd::simd8x64 in, json_block block, size_t idx) { - uint64_t unescaped = in.lteq(0x1F); - checker.check_next_input(in); - indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser - prev_structurals = block.structural_start(); - unescaped_chars_error |= block.non_quote_inside_string(unescaped); -} - -really_inline error_code json_structural_indexer::finish(parser &parser, size_t idx, size_t len, bool streaming) { - // Write out the final iteration's structurals - indexer.write(uint32_t(idx-64), prev_structurals); - - error_code error = scanner.finish(streaming); - if (unlikely(error != SUCCESS)) { return error; } - - if (unescaped_chars_error) { - return UNESCAPED_CHARS; - } - - parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); - /* a valid JSON file cannot have zero structural indexes - we should have - * found something */ - if (unlikely(parser.n_structural_indexes == 0u)) { - return EMPTY; - } - if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { - return UNEXPECTED_ERROR; - } - if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) { - /* the string might not be NULL terminated, but we add a virtual NULL - * ending character. */ - parser.structural_indexes[parser.n_structural_indexes++] = uint32_t(len); - } - /* make it safe to dereference one beyond this array */ - parser.structural_indexes[parser.n_structural_indexes] = 0; - return checker.errors(); -} - -template<> -really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept { - simd::simd8x64 in_1(block); - simd::simd8x64 in_2(block+64); - json_block block_1 = scanner.next(in_1); - json_block block_2 = scanner.next(in_2); - this->next(in_1, block_1, reader.block_index()); - this->next(in_2, block_2, reader.block_index()+64); - reader.advance(); -} - -template<> -really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept { - simd::simd8x64 in_1(block); - json_block block_1 = scanner.next(in_1); - this->next(in_1, block_1, reader.block_index()); - reader.advance(); -} +really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} -// -// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. // // PERF NOTES: // We pipe 2 inputs through these stages: @@ -8570,41 +8936,116 @@ really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_b // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough // workout. // -// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings. -// The caller should still ensure that the input is valid UTF-8. If you are processing substrings, -// you may want to call on a function like trimmed_length_safe_utf8. template -error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept { +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept { if (unlikely(len > parser.capacity())) { return CAPACITY; } + if (partial) { len = trim_partial_utf8(buf, len); } buf_block_reader reader(buf, len); json_structural_indexer indexer(parser.structural_indexes.get()); + + // Read all but the last block while (reader.has_full_block()) { indexer.step(reader.full_block(), reader); } - if (likely(reader.has_remainder())) { - uint8_t block[STEP_SIZE]; - reader.get_remainder(block); - indexer.step(block, reader); - } - - return indexer.finish(parser, reader.block_index(), len, streaming); -} + // Take care of the last block (will always be there unless file is empty) + uint8_t block[STEP_SIZE]; + if (unlikely(reader.get_remainder(block) == 0)) { return EMPTY; } + indexer.step(block, reader); -} // namespace stage1 -/* end file src/generic/stage1/json_structural_indexer.h */ -WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept { - return haswell::stage1::json_structural_indexer::index<128>(buf, len, parser, streaming); + return indexer.finish(parser, reader.block_index(), len, partial); } -} // namespace haswell +template<> +really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept { + simd::simd8x64 in_1(block); + simd::simd8x64 in_2(block+64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1, reader.block_index()); + this->next(in_2, block_2, reader.block_index()+64); + reader.advance(); +} + +template<> +really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept { + simd::simd8x64 in_1(block); + json_block block_1 = scanner.next(in_1); + this->next(in_1, block_1, reader.block_index()); + reader.advance(); +} +really_inline void json_structural_indexer::next(simd::simd8x64 in, json_block block, size_t idx) { + uint64_t unescaped = in.lteq(0x1F); + checker.check_next_input(in); + indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser + prev_structurals = block.structural_start(); + unescaped_chars_error |= block.non_quote_inside_string(unescaped); +} + +really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) { + // Write out the final iteration's structurals + indexer.write(uint32_t(idx-64), prev_structurals); + + error_code error = scanner.finish(partial); + if (unlikely(error != SUCCESS)) { return error; } + + if (unescaped_chars_error) { + return UNESCAPED_CHARS; + } + + parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); + /*** + * This is related to https://github.com/simdjson/simdjson/issues/906 + * Basically, we want to make sure that if the parsing continues beyond the last (valid) + * structural character, it quickly stops. + * Only three structural characters can be repeated without triggering an error in JSON: [,] and }. + * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing + * continues, then it must be [,] or }. + * Suppose it is ] or }. We backtrack to the first character, what could it be that would + * not trigger an error? It could be ] or } but no, because you can't start a document that way. + * It can't be a comma, a colon or any simple value. So the only way we could continue is + * if the repeated character is [. But if so, the document must start with [. But if the document + * starts with [, it should end with ]. If we enforce that rule, then we would get + * ][[ which is invalid. + **/ + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 2] = 0; + parser.next_structural_index = 0; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (unlikely(parser.n_structural_indexes == 0u)) { + return EMPTY; + } + if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { + return UNEXPECTED_ERROR; + } + if (partial) { + auto new_structural_indexes = find_next_document_index(parser); + if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { + return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. + } + parser.n_structural_indexes = new_structural_indexes; + } + return checker.errors(); +} + +} // namespace stage1 +/* end file src/generic/stage1/json_structural_indexer.h */ +WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept { + this->buf = _buf; + this->len = _len; + return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming); +} + +} // namespace haswell } // namespace simdjson UNTARGET_REGION -/* end file src/generic/stage1/json_structural_indexer.h */ -/* begin file src/haswell/stage2.cpp */ -/* haswell/implementation.h already included: #include "haswell/implementation.h" */ + +// +// Stage 2 +// /* begin file src/haswell/stringparsing.h */ #ifndef SIMDJSON_HASWELL_STRINGPARSING_H #define SIMDJSON_HASWELL_STRINGPARSING_H @@ -9015,10 +9456,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) { // If you consume a large value and you map it to "infinity", you will no // longer be able to serialize back a standard-compliant JSON. And there is // no realistic application where you might need values so large than they - // can't fit in binary64. The maximal value is about 1.7976931348623157 × + // can't fit in binary64. The maximal value is about 1.7976931348623157 x // 10^308 It is an unimaginable large number. There will never be any piece of // engineering involving as many as 10^308 parts. It is estimated that there - // are about 10^80 atoms in the universe.  The estimate for the total number + // are about 10^80 atoms in the universe. The estimate for the total number // of electrons is similar. Using a double-precision floating-point value, we // can represent easily the number of atoms in the universe. We could also // represent the number of ways you can pick any three individual atoms at @@ -9038,26 +9479,6 @@ really_inline bool is_integer(char c) { // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers } -// We need to check that the character following a zero is valid. This is -// probably frequent and it is harder than it looks. We are building all of this -// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)... -const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - -really_inline bool -is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) { - return structural_or_whitespace_or_exponent_or_decimal_negated[c]; -} // check quickly whether the next 8 chars are made of digits // at a glance, it looks better than Mula's @@ -9135,14 +9556,14 @@ never_inline bool parse_large_integer(const uint8_t *const src, // as a positive signed integer, but the negative version is // possible. constexpr int64_t signed_answer = INT64_MIN; - writer.write_s64(signed_answer); + writer.append_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif } else { // we can negate safely int64_t signed_answer = -static_cast(i); - writer.write_s64(signed_answer); + writer.append_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif @@ -9155,12 +9576,12 @@ never_inline bool parse_large_integer(const uint8_t *const src, #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif - writer.write_s64(i); + writer.append_s64(i); } else { #ifdef JSON_TEST_NUMBERS // for unit testing found_unsigned_integer(i, src); #endif - writer.write_u64(i); + writer.append_u64(i); } } return is_structural_or_whitespace(*p); @@ -9170,7 +9591,7 @@ template bool slow_float_parsing(UNUSED const char * src, W writer) { double d; if (parse_float_strtod(src, &d)) { - writer.write_double(d); + writer.append_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, (const uint8_t *)src); #endif @@ -9194,10 +9615,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) { template really_inline bool parse_number(UNUSED const uint8_t *const src, UNUSED bool found_minus, - W writer) { + W &writer) { #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes // useful to skip parsing - writer.write_s64(0); // always write zero + writer.append_s64(0); // always write zero return true; // always succeeds #else const char *p = reinterpret_cast(src); @@ -9217,7 +9638,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, uint64_t i; // an unsigned int avoids signed overflows (which are bad) if (*p == '0') { // 0 cannot be followed by an integer ++p; - if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) { + if (is_integer(*p)) { #ifdef JSON_TEST_NUMBERS // for unit testing found_invalid_number(src); #endif @@ -9341,7 +9762,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, } // we over-decrement by one when there is a '.' digit_count -= int(start - start_digits); - if (digit_count >= 19) { + if (unlikely(digit_count >= 19)) { // Ok, chances are good that we had an overflow! // this is almost never going to get called!!! // we start anew, going slowly!!! @@ -9349,14 +9770,22 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, // 10000000000000000000000000000000000000000000e+308 // 3.1415926535897932384626433832795028841971693993751 // - return slow_float_parsing((const char *) src, writer); + bool success = slow_float_parsing((const char *) src, writer); + // The number was already written, but we made a copy of the writer + // when we passed it to the parse_large_integer() function, so + writer.skip_double(); + return success; } } if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!! // this is almost never going to get called!!! // we start anew, going slowly!!! - return slow_float_parsing((const char *) src, writer); + bool success = slow_float_parsing((const char *) src, writer); + // The number was already written, but we made a copy of the writer when we passed it to the + // slow_float_parsing() function, so we have to skip those tape spots now that we've returned + writer.skip_double(); + return success; } bool success = true; double d = compute_float_64(exponent, i, negative, &success); @@ -9365,7 +9794,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, success = parse_float_strtod((const char *)src, &d); } if (success) { - writer.write_double(d); + writer.append_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, src); #endif @@ -9380,10 +9809,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, if (unlikely(digit_count >= 18)) { // this is uncommon!!! // there is a good chance that we had an overflow, so we need // need to recover: we parse the whole thing again. - return parse_large_integer(src, writer, found_minus); + bool success = parse_large_integer(src, writer, found_minus); + // The number was already written, but we made a copy of the writer + // when we passed it to the parse_large_integer() function, so + writer.skip_large_integer(); + return success; } i = negative ? 0 - i : i; - writer.write_s64(i); + writer.append_s64(i); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif @@ -9408,6 +9841,72 @@ TARGET_HASWELL namespace simdjson { namespace haswell { +/* begin file src/generic/stage2/logger.h */ +// This is for an internal-only stage 2 specific logger. +// Set LOG_ENABLED = true to log what stage 2 is doing! +namespace logger { + static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; + + static constexpr const bool LOG_ENABLED = false; + static constexpr const int LOG_EVENT_LEN = 30; + static constexpr const int LOG_BUFFER_LEN = 20; + static constexpr const int LOG_DETAIL_LEN = 50; + static constexpr const int LOG_INDEX_LEN = 10; + + static int log_depth; // Not threadsafe. Log only. + + // Helper to turn unprintable or newline characters into spaces + static really_inline char printable_char(char c) { + if (c >= 0x20) { + return c; + } else { + return ' '; + } + } + + // Print the header and set up log_start + static really_inline void log_start() { + if (LOG_ENABLED) { + log_depth = 0; + printf("\n"); + printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index"); + printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES); + } + } + + static really_inline void log_string(const char *message) { + if (LOG_ENABLED) { + printf("%s\n", message); + } + } + + // Logs a single line of + template + static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { + if (LOG_ENABLED) { + printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); + { + // Print the next N characters in the buffer. + printf("| "); + // Otherwise, print the characters starting from the buffer position. + // Print spaces for unprintable or newline characters. + for (int i=0;i really_inline bool with_space_terminated_copy(const F& f) { @@ -9502,32 +10009,25 @@ class structural_iterator { * practice unless you are in the strange scenario where you have many JSON * documents made of single atoms. */ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + char *copy = static_cast(malloc(parser.len + SIMDJSON_PADDING)); if (copy == nullptr) { return true; } - memcpy(copy, buf, len); - memset(copy + len, ' ', SIMDJSON_PADDING); - bool result = f(reinterpret_cast(copy), idx); + memcpy(copy, buf, parser.len); + memset(copy + parser.len, ' ', SIMDJSON_PADDING); + bool result = f(reinterpret_cast(copy), *current_structural); free(copy); return result; } really_inline bool past_end(uint32_t n_structural_indexes) { - return next_structural+1 > n_structural_indexes; + return current_structural >= &parser.structural_indexes[n_structural_indexes]; } really_inline bool at_end(uint32_t n_structural_indexes) { - return next_structural+1 == n_structural_indexes; + return current_structural == &parser.structural_indexes[n_structural_indexes]; } - really_inline size_t next_structural_index() { - return next_structural; + really_inline bool at_beginning() { + return current_structural == parser.structural_indexes.get(); } - - const uint8_t* const buf; - const size_t len; - const uint32_t* const structural_indexes; - size_t next_structural; // next structural index - size_t idx{0}; // location of the structural character in the input (buf) - uint8_t c{0}; // used to track the (structural) character we are looking at }; } // namespace stage2 @@ -9539,8 +10039,105 @@ class structural_iterator { // "simdjson/stage2.h" (this simplifies amalgation) namespace stage2 { +namespace { // Make everything here private + +/* begin file src/generic/stage2/tape_writer.h */ +struct tape_writer { + /** The next place to write to tape */ + uint64_t *next_tape_loc; + + /** Write a signed 64-bit value to tape. */ + really_inline void append_s64(int64_t value) noexcept; + + /** Write an unsigned 64-bit value to tape. */ + really_inline void append_u64(uint64_t value) noexcept; + + /** Write a double value to tape. */ + really_inline void append_double(double value) noexcept; + + /** + * Append a tape entry (an 8-bit type,and 56 bits worth of value). + */ + really_inline void append(uint64_t val, internal::tape_type t) noexcept; + + /** + * Skip the current tape entry without writing. + * + * Used to skip the start of the container, since we'll come back later to fill it in when the + * container ends. + */ + really_inline void skip() noexcept; + + /** + * Skip the number of tape entries necessary to write a large u64 or i64. + */ + really_inline void skip_large_integer() noexcept; + + /** + * Skip the number of tape entries necessary to write a double. + */ + really_inline void skip_double() noexcept; + + /** + * Write a value to a known location on tape. + * + * Used to go back and write out the start of a container after the container ends. + */ + really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; + +private: + /** + * Append both the tape entry, and a supplementary value following it. Used for types that need + * all 64 bits, such as double and uint64_t. + */ + template + really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; +}; // struct number_writer + +really_inline void tape_writer::append_s64(int64_t value) noexcept { + append2(0, value, internal::tape_type::INT64); +} + +really_inline void tape_writer::append_u64(uint64_t value) noexcept { + append(0, internal::tape_type::UINT64); + *next_tape_loc = value; + next_tape_loc++; +} + +/** Write a double value to tape. */ +really_inline void tape_writer::append_double(double value) noexcept { + append2(0, value, internal::tape_type::DOUBLE); +} + +really_inline void tape_writer::skip() noexcept { + next_tape_loc++; +} + +really_inline void tape_writer::skip_large_integer() noexcept { + next_tape_loc += 2; +} + +really_inline void tape_writer::skip_double() noexcept { + next_tape_loc += 2; +} + +really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept { + *next_tape_loc = val | ((uint64_t(char(t))) << 56); + next_tape_loc++; +} + +template +really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept { + append(val, t); + static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); + memcpy(next_tape_loc, &val2, sizeof(val2)); + next_tape_loc++; +} -using internal::ret_address; +really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept { + tape_loc = val | ((uint64_t(char(t))) << 56); +} +/* end file src/generic/stage2/tape_writer.h */ #ifdef SIMDJSON_USE_COMPUTED_GOTO #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue } @@ -9571,102 +10168,88 @@ using internal::ret_address; #endif // SIMDJSON_USE_COMPUTED_GOTO struct unified_machine_addresses { - ret_address array_begin; - ret_address array_continue; - ret_address error; - ret_address finish; - ret_address object_begin; - ret_address object_continue; + ret_address_t array_begin; + ret_address_t array_continue; + ret_address_t error; + ret_address_t finish; + ret_address_t object_begin; + ret_address_t object_continue; }; #undef FAIL_IF #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } } -struct number_writer { - parser &doc_parser; - - really_inline void write_s64(int64_t value) noexcept { - write_tape(0, internal::tape_type::INT64); - std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value)); - ++doc_parser.current_loc; - } - really_inline void write_u64(uint64_t value) noexcept { - write_tape(0, internal::tape_type::UINT64); - doc_parser.doc.tape[doc_parser.current_loc++] = value; - } - really_inline void write_double(double value) noexcept { - write_tape(0, internal::tape_type::DOUBLE); - static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size"); - memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double)); - // doc.tape[doc.current_loc++] = *((uint64_t *)&d); - } - really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { - doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); - } -}; // struct number_writer - -struct structural_parser { - structural_iterator structurals; - parser &doc_parser; +struct structural_parser : structural_iterator { + /** Lets you append to the tape */ + tape_writer tape; /** Next write location in the string buf for stage 2 parsing */ - uint8_t *current_string_buf_loc{}; - uint32_t depth; - - really_inline structural_parser( - const uint8_t *buf, - size_t len, - parser &_doc_parser, - uint32_t next_structural = 0 - ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {} - - WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) { - doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc; - doc_parser.containing_scope[depth].count = 0; - write_tape(0, type); // if the document is correct, this gets rewritten later - doc_parser.ret_address[depth] = continue_state; + uint8_t *current_string_buf_loc; + /** Current depth (nested objects and arrays) */ + uint32_t depth{0}; + + // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations + really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index) + : structural_iterator(_parser, start_structural_index), + tape{parser.doc->tape.get()}, + current_string_buf_loc{parser.doc->string_buf.get()} { + } + + WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) { + parser.containing_scope[depth].tape_index = next_tape_index(); + parser.containing_scope[depth].count = 0; + tape.skip(); // We don't actually *write* the start element until the end. + parser.ret_address[depth] = continue_state; depth++; - return depth >= doc_parser.max_depth(); + bool exceeded_max_depth = depth >= parser.max_depth(); + if (exceeded_max_depth) { log_error("Exceeded max depth!"); } + return exceeded_max_depth; } - WARN_UNUSED really_inline bool start_document(ret_address continue_state) { - return start_scope(internal::tape_type::ROOT, continue_state); + WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) { + log_start_value("document"); + return start_scope(continue_state); } - WARN_UNUSED really_inline bool start_object(ret_address continue_state) { - return start_scope(internal::tape_type::START_OBJECT, continue_state); + WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) { + log_start_value("object"); + return start_scope(continue_state); } - WARN_UNUSED really_inline bool start_array(ret_address continue_state) { - return start_scope(internal::tape_type::START_ARRAY, continue_state); + WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) { + log_start_value("array"); + return start_scope(continue_state); } // this function is responsible for annotating the start of the scope - really_inline void end_scope(internal::tape_type type) noexcept { + really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept { depth--; - // write our doc.tape location to the header scope + // write our doc->tape location to the header scope // The root scope gets written *at* the previous location. - write_tape(doc_parser.containing_scope[depth].tape_index, type); + tape.append(parser.containing_scope[depth].tape_index, end); // count can overflow if it exceeds 24 bits... so we saturate // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). - const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index; - const uint32_t count = doc_parser.containing_scope[depth].count; + const uint32_t start_tape_index = parser.containing_scope[depth].tape_index; + const uint32_t count = parser.containing_scope[depth].count; const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; - // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index] - doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32); + // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index] + tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start); + } + + really_inline uint32_t next_tape_index() { + return uint32_t(tape.next_tape_loc - parser.doc->tape.get()); } really_inline void end_object() { - end_scope(internal::tape_type::END_OBJECT); + log_end_value("object"); + end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); } really_inline void end_array() { - end_scope(internal::tape_type::END_ARRAY); + log_end_value("array"); + end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); } really_inline void end_document() { - end_scope(internal::tape_type::ROOT); - } - - really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { - doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); + log_end_value("document"); + end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT); } // increment_count increments the count of keys in an object or values in an array. @@ -9674,17 +10257,16 @@ struct structural_parser { // must be increment in the preceding depth (depth-1) where the array or // the object resides. really_inline void increment_count() { - doc_parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 + parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 } really_inline uint8_t *on_start_string() noexcept { - /* we advance the point, accounting for the fact that we have a NULL - * termination */ - write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING); + // we advance the point, accounting for the fact that we have a NULL termination + tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING); return current_string_buf_loc + sizeof(uint32_t); } - really_inline bool on_end_string(uint8_t *dst) noexcept { + really_inline void on_end_string(uint8_t *dst) noexcept { uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); // TODO check for overflow in case someone has a crazy string (>=4GB?) // But only add the overflow check when the document itself exceeds 4GB @@ -9694,73 +10276,49 @@ struct structural_parser { // be NULL terminated? It comes at a small cost *dst = 0; current_string_buf_loc = dst + 1; - return true; } - WARN_UNUSED really_inline bool parse_string() { + WARN_UNUSED really_inline bool parse_string(bool key = false) { + log_value(key ? "key" : "string"); uint8_t *dst = on_start_string(); - dst = stringparsing::parse_string(structurals.current(), dst); + dst = stringparsing::parse_string(current(), dst); if (dst == nullptr) { + log_error("Invalid escape in string"); return true; } - return !on_end_string(dst); + on_end_string(dst); + return false; } WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) { - number_writer writer{doc_parser}; - return !numberparsing::parse_number(src, found_minus, writer); + log_value("number"); + bool succeeded = numberparsing::parse_number(src, found_minus, tape); + if (!succeeded) { log_error("Invalid number"); } + return !succeeded; } WARN_UNUSED really_inline bool parse_number(bool found_minus) { - return parse_number(structurals.current(), found_minus); + return parse_number(current(), found_minus); } - WARN_UNUSED really_inline bool parse_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; } - write_tape(0, internal::tape_type::TRUE_VALUE); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; } - write_tape(0, internal::tape_type::FALSE_VALUE); - break; - case 'n': - if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; } - write_tape(0, internal::tape_type::NULL_VALUE); - break; - default: - return true; - } - return false; - } - - WARN_UNUSED really_inline bool parse_single_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; } - write_tape(0, internal::tape_type::TRUE_VALUE); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; } - write_tape(0, internal::tape_type::FALSE_VALUE); - break; - case 'n': - if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; } - write_tape(0, internal::tape_type::NULL_VALUE); - break; - default: - return true; - } - return false; - } - - WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) { - switch (structurals.current_char()) { + WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) { + switch (advance_char()) { case '"': FAIL_IF( parse_string() ); return continue_state; - case 't': case 'f': case 'n': - FAIL_IF( parse_atom() ); + case 't': + log_value("true"); + FAIL_IF( !atomparsing::is_valid_true_atom(current()) ); + tape.append(0, internal::tape_type::TRUE_VALUE); + return continue_state; + case 'f': + log_value("false"); + FAIL_IF( !atomparsing::is_valid_false_atom(current()) ); + tape.append(0, internal::tape_type::FALSE_VALUE); + return continue_state; + case 'n': + log_value("null"); + FAIL_IF( !atomparsing::is_valid_null_atom(current()) ); + tape.append(0, internal::tape_type::NULL_VALUE); return continue_state; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': @@ -9776,40 +10334,27 @@ struct structural_parser { FAIL_IF( start_array(continue_state) ); return addresses.array_begin; default: + log_error("Non-value found when value was expected!"); return addresses.error; } } WARN_UNUSED really_inline error_code finish() { - // the string might not be NULL terminated. - if ( !structurals.at_end(doc_parser.n_structural_indexes) ) { - return on_error(TAPE_ERROR); - } end_document(); + parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]); + if (depth != 0) { - return on_error(TAPE_ERROR); + log_error("Unclosed objects or arrays!"); + return parser.error = TAPE_ERROR; } - if (doc_parser.containing_scope[depth].tape_index != 0) { - return on_error(TAPE_ERROR); - } - - return on_success(SUCCESS); - } - really_inline error_code on_error(error_code new_error_code) noexcept { - doc_parser.error = new_error_code; - return new_error_code; - } - really_inline error_code on_success(error_code success_code) noexcept { - doc_parser.error = success_code; - doc_parser.valid = true; - return success_code; + return SUCCESS; } WARN_UNUSED really_inline error_code error() { - /* We do not need the next line because this is done by doc_parser.init_stage2(), + /* We do not need the next line because this is done by parser.init_stage2(), * pessimistically. - * doc_parser.is_valid = false; + * parser.is_valid = false; * At this point in the code, we have all the time in the world. * Note that we know exactly where we are in the document so we could, * without any overhead on the processing code, report a specific @@ -9817,12 +10362,12 @@ struct structural_parser { * We could even trigger special code paths to assess what happened * carefully, * all without any added cost. */ - if (depth >= doc_parser.max_depth()) { - return on_error(DEPTH_ERROR); + if (depth >= parser.max_depth()) { + return parser.error = DEPTH_ERROR; } - switch (structurals.current_char()) { + switch (current_char()) { case '"': - return on_error(STRING_ERROR); + return parser.error = STRING_ERROR; case '0': case '1': case '2': @@ -9834,302 +10379,173 @@ struct structural_parser { case '8': case '9': case '-': - return on_error(NUMBER_ERROR); + return parser.error = NUMBER_ERROR; case 't': - return on_error(T_ATOM_ERROR); + return parser.error = T_ATOM_ERROR; case 'n': - return on_error(N_ATOM_ERROR); + return parser.error = N_ATOM_ERROR; case 'f': - return on_error(F_ATOM_ERROR); + return parser.error = F_ATOM_ERROR; default: - return on_error(TAPE_ERROR); + return parser.error = TAPE_ERROR; } } really_inline void init() { - current_string_buf_loc = doc_parser.doc.string_buf.get(); - doc_parser.current_loc = 0; - doc_parser.valid = false; - doc_parser.error = UNINITIALIZED; + log_start(); + parser.error = UNINITIALIZED; } - WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) { - init(); // sets is_valid to false - if (len > doc_parser.capacity()) { - return CAPACITY; + WARN_UNUSED really_inline error_code start(ret_address_t finish_state) { + // If there are no structurals left, return EMPTY + if (at_end(parser.n_structural_indexes)) { + return parser.error = EMPTY; } - // Advance to the first character as soon as possible - structurals.advance_char(); + + init(); // Push the root scope (there is always at least one scope) if (start_document(finish_state)) { - return on_error(DEPTH_ERROR); + return parser.error = DEPTH_ERROR; } return SUCCESS; } - really_inline char advance_char() { - return structurals.advance_char(); - } -}; - -// Redefine FAIL_IF to use goto since it'll be used inside the function now -#undef FAIL_IF -#define FAIL_IF(EXPR) { if (EXPR) { goto error; } } - -} // namespace stage2 - -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. - ***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { - static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::structural_parser parser(buf, len, doc_parser); - error_code result = parser.start(len, addresses.finish); - if (result) { return result; } - - // - // Read first value - // - switch (parser.structurals.current_char()) { - case '{': - FAIL_IF( parser.start_object(addresses.finish) ); - goto object_begin; - case '[': - FAIL_IF( parser.start_array(addresses.finish) ); - goto array_begin; - case '"': - FAIL_IF( parser.parse_string() ); - goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); - goto finish; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { - return parser.parse_number(©[idx], false); - }) - ); - goto finish; - case '-': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { - return parser.parse_number(©[idx], true); - }) - ); - goto finish; - default: - goto error; - } - -// -// Object parser states -// -object_begin: - switch (parser.advance_char()) { - case '"': { - parser.increment_count(); - FAIL_IF( parser.parse_string() ); - goto object_key_state; - } - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } - -object_key_state: - FAIL_IF( parser.advance_char() != ':' ); - parser.advance_char(); - GOTO( parser.parse_value(addresses, addresses.object_continue) ); - -object_continue: - switch (parser.advance_char()) { - case ',': - parser.increment_count(); - FAIL_IF( parser.advance_char() != '"' ); - FAIL_IF( parser.parse_string() ); - goto object_key_state; - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } - -scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); - -// -// Array parser states -// -array_begin: - if (parser.advance_char() == ']') { - parser.end_array(); - goto scope_end; + really_inline void log_value(const char *type) { + logger::log_line(*this, "", type, ""); } - parser.increment_count(); -main_array_switch: - /* we call update char on all paths in, so we can peek at parser.c on the - * on paths that can accept a close square brace (post-, and at start) */ - GOTO( parser.parse_value(addresses, addresses.array_continue) ); - -array_continue: - switch (parser.advance_char()) { - case ',': - parser.increment_count(); - parser.advance_char(); - goto main_array_switch; - case ']': - parser.end_array(); - goto scope_end; - default: - goto error; + static really_inline void log_start() { + logger::log_start(); } -finish: - return parser.finish(); - -error: - return parser.error(); -} - -WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { - error_code code = stage1(buf, len, doc_parser, false); - if (!code) { - code = stage2(buf, len, doc_parser); + really_inline void log_start_value(const char *type) { + logger::log_line(*this, "+", type, ""); + if (logger::LOG_ENABLED) { logger::log_depth++; } } - return code; -} -/* end file src/generic/stage2/structural_parser.h */ -/* begin file src/generic/stage2/streaming_structural_parser.h */ -namespace stage2 { - -struct streaming_structural_parser: structural_parser { - really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {} - // override to add streaming - WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) { - init(); // sets is_valid to false - // Capacity ain't no thang for streaming, so we don't check it. - // Advance to the first character as soon as possible - advance_char(); - // Push the root scope (there is always at least one scope) - if (start_document(finish_parser)) { - return on_error(DEPTH_ERROR); - } - return SUCCESS; + really_inline void log_end_value(const char *type) { + if (logger::LOG_ENABLED) { logger::log_depth--; } + logger::log_line(*this, "-", type, ""); } - // override to add streaming - WARN_UNUSED really_inline error_code finish() { - if ( structurals.past_end(doc_parser.n_structural_indexes) ) { - return on_error(TAPE_ERROR); - } - end_document(); - if (depth != 0) { - return on_error(TAPE_ERROR); - } - if (doc_parser.containing_scope[depth].tape_index != 0) { - return on_error(TAPE_ERROR); - } - bool finished = structurals.at_end(doc_parser.n_structural_indexes); - return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE); + really_inline void log_error(const char *error) { + logger::log_line(*this, "", "ERROR", error); } -}; +}; // struct structural_parser -} // namespace stage2 +// Redefine FAIL_IF to use goto since it'll be used inside the function now +#undef FAIL_IF +#define FAIL_IF(EXPR) { if (EXPR) { goto error; } } -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. - ***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept { +template +WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept { + dom_parser.doc = &doc; static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json)); - error_code result = parser.start(len, addresses.finish); + stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0); + error_code result = parser.start(addresses.finish); if (result) { return result; } + // // Read first value // - switch (parser.structurals.current_char()) { + switch (parser.current_char()) { case '{': FAIL_IF( parser.start_object(addresses.finish) ); goto object_begin; case '[': FAIL_IF( parser.start_array(addresses.finish) ); + // Make sure the outer array is closed before continuing; otherwise, there are ways we could get + // into memory corruption. See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) { + if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') { + goto error; + } + } goto array_begin; case '"': FAIL_IF( parser.parse_string() ); goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); + case 't': + parser.log_value("true"); + FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) ); + parser.tape.append(0, internal::tape_type::TRUE_VALUE); + goto finish; + case 'f': + parser.log_value("false"); + FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) ); + parser.tape.append(0, internal::tape_type::FALSE_VALUE); + goto finish; + case 'n': + parser.log_value("null"); + FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) ); + parser.tape.append(0, internal::tape_type::NULL_VALUE); goto finish; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], false); }) ); goto finish; case '-': FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], true); }) ); goto finish; default: + parser.log_error("Document starts with a non-value character"); goto error; } // -// Object parser parsers +// Object parser states // object_begin: switch (parser.advance_char()) { case '"': { - FAIL_IF( parser.parse_string() ); - goto object_key_parser; + parser.increment_count(); + FAIL_IF( parser.parse_string(true) ); + goto object_key_state; } case '}': parser.end_object(); goto scope_end; default: + parser.log_error("Object does not start with a key"); goto error; } -object_key_parser: - FAIL_IF( parser.advance_char() != ':' ); - parser.increment_count(); - parser.advance_char(); +object_key_state: + if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; } GOTO( parser.parse_value(addresses, addresses.object_continue) ); object_continue: switch (parser.advance_char()) { case ',': - FAIL_IF( parser.advance_char() != '"' ); - FAIL_IF( parser.parse_string() ); - goto object_key_parser; + parser.increment_count(); + if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; } + FAIL_IF( parser.parse_string(true) ); + goto object_key_state; case '}': parser.end_object(); goto scope_end; default: + parser.log_error("No comma between object fields"); goto error; } scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); + CONTINUE( parser.parser.ret_address[parser.depth] ); // -// Array parser parsers +// Array parser states // array_begin: - if (parser.advance_char() == ']') { + if (parser.peek_next_char() == ']') { + parser.advance_char(); parser.end_array(); goto scope_end; } @@ -10144,31 +10560,208 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa switch (parser.advance_char()) { case ',': parser.increment_count(); - parser.advance_char(); goto main_array_switch; case ']': parser.end_array(); goto scope_end; default: + parser.log_error("Missing comma between array values"); goto error; } finish: - next_json = parser.structurals.next_structural_index(); return parser.finish(); error: return parser.error(); } -/* end file src/generic/stage2/streaming_structural_parser.h */ + +} // namespace {} +} // namespace stage2 + +/************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. + ***********/ +WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { + error_code result = stage2::parse_structurals(*this, _doc); + if (result) { return result; } + + // If we didn't make it to the end, it's an error + if ( next_structural_index != n_structural_indexes ) { + logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return error = TAPE_ERROR; + } + + return SUCCESS; +} + +/************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. + ***********/ +WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + return stage2::parse_structurals(*this, _doc); +} +/* end file src/generic/stage2/tape_writer.h */ + +WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { + error_code err = stage1(_buf, _len, false); + if (err) { return err; } + return stage2(_doc); +} } // namespace haswell } // namespace simdjson UNTARGET_REGION -/* end file src/generic/stage2/streaming_structural_parser.h */ +/* end file src/generic/stage2/tape_writer.h */ #endif #if SIMDJSON_IMPLEMENTATION_WESTMERE -/* begin file src/westmere/stage1.cpp */ +/* begin file src/westmere/implementation.cpp */ +/* westmere/implementation.h already included: #include "westmere/implementation.h" */ +/* begin file src/westmere/dom_parser_implementation.h */ +#ifndef SIMDJSON_WESTMERE_DOM_PARSER_IMPLEMENTATION_H +#define SIMDJSON_WESTMERE_DOM_PARSER_IMPLEMENTATION_H + +/* isadetection.h already included: #include "isadetection.h" */ + +namespace simdjson { +namespace westmere { + +/* begin file src/generic/dom_parser_implementation.h */ +// expectation: sizeof(scope_descriptor) = 64/8. +struct scope_descriptor { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope +}; // struct scope_descriptor + +#ifdef SIMDJSON_USE_COMPUTED_GOTO +typedef void* ret_address_t; +#else +typedef char ret_address_t; +#endif + +class dom_parser_implementation final : public internal::dom_parser_implementation { +public: + /** Tape location of each open { or [ */ + std::unique_ptr containing_scope{}; + /** Return address of each open { or [ */ + std::unique_ptr ret_address{}; + /** Buffer passed to stage 1 */ + const uint8_t *buf{}; + /** Length passed to stage 1 */ + size_t len{0}; + /** Document passed to stage 2 */ + dom::document *doc{}; + /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */ + error_code error{UNINITIALIZED}; + + really_inline dom_parser_implementation(); + dom_parser_implementation(const dom_parser_implementation &) = delete; + dom_parser_implementation & operator=(const dom_parser_implementation &) = delete; + + WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; + WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; + WARN_UNUSED error_code check_for_unclosed_array() noexcept; + WARN_UNUSED error_code stage2(dom::document &doc) noexcept final; + WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final; + WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final; + WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final; +}; + +/* begin file src/generic/stage1/allocate.h */ +namespace stage1 { +namespace allocate { + +// +// Allocates stage 1 internal state and outputs in the parser +// +really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) { + size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; + parser.structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); + if (!parser.structural_indexes) { return MEMALLOC; } + parser.structural_indexes[0] = 0; + parser.n_structural_indexes = 0; + return SUCCESS; +} + +} // namespace allocate +} // namespace stage1 +/* end file src/generic/stage1/allocate.h */ +/* begin file src/generic/stage2/allocate.h */ +namespace stage2 { +namespace allocate { + +// +// Allocates stage 2 internal state and outputs in the parser +// +really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) { + parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]); + parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]); + + if (!parser.ret_address || !parser.containing_scope) { + return MEMALLOC; + } + return SUCCESS; +} + +} // namespace allocate +} // namespace stage2 +/* end file src/generic/stage2/allocate.h */ + +really_inline dom_parser_implementation::dom_parser_implementation() {} + +// Leaving these here so they can be inlined if so desired +WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept { + error_code err = stage1::allocate::set_capacity(*this, capacity); + if (err) { _capacity = 0; return err; } + _capacity = capacity; + return SUCCESS; +} + +WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { + error_code err = stage2::allocate::set_max_depth(*this, max_depth); + if (err) { _max_depth = 0; return err; } + _max_depth = max_depth; + return SUCCESS; +} +/* end file src/generic/stage2/allocate.h */ + +} // namespace westmere +} // namespace simdjson + +#endif // SIMDJSON_WESTMERE_DOM_PARSER_IMPLEMENTATION_H +/* end file src/generic/stage2/allocate.h */ + +TARGET_HASWELL + +namespace simdjson { +namespace westmere { + +WARN_UNUSED error_code implementation::create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr& dst +) const noexcept { + dst.reset( new (std::nothrow) dom_parser_implementation() ); + if (!dst) { return MEMALLOC; } + dst->set_capacity(capacity); + dst->set_max_depth(max_depth); + return SUCCESS; +} + +} // namespace westmere +} // namespace simdjson + +UNTARGET_REGION +/* end file src/generic/stage2/allocate.h */ +/* begin file src/westmere/dom_parser_implementation.cpp */ +/* westmere/implementation.h already included: #include "westmere/implementation.h" */ +/* westmere/dom_parser_implementation.h already included: #include "westmere/dom_parser_implementation.h" */ + +// +// Stage 1 +// /* begin file src/westmere/bitmask.h */ #ifndef SIMDJSON_WESTMERE_BITMASK_H #define SIMDJSON_WESTMERE_BITMASK_H @@ -10739,24 +11332,21 @@ really_inline simd8 must_be_continuation(simd8 prev1, simd8 struct buf_block_reader { public: - really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} - really_inline size_t block_index() { return idx; } - really_inline bool has_full_block() const { - return idx < lenminusstep; - } - really_inline const uint8_t *full_block() const { - return &buf[idx]; - } - really_inline bool has_remainder() const { - return idx < len; - } - really_inline void get_remainder(uint8_t *tmp_buf) const { - memset(tmp_buf, 0x20, STEP_SIZE); - memcpy(tmp_buf, buf + idx, len - idx); - } - really_inline void advance() { - idx += STEP_SIZE; - } + really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + really_inline size_t block_index(); + really_inline bool has_full_block() const; + really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this + * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there + * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + really_inline size_t get_remainder(uint8_t *dst) const; + really_inline void advance(); private: const uint8_t *buf; const size_t len; @@ -10764,6 +11354,18 @@ struct buf_block_reader { size_t idx; }; +constexpr const int TITLE_SIZE = 12; + +// Routines to print masks and text for debugging bitmask operations +UNUSED static char * format_input_text_64(const uint8_t *text) { + static char *buf = (char*)malloc(sizeof(simd8x64) + 1); + for (size_t i=0; i); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + // Routines to print masks and text for debugging bitmask operations UNUSED static char * format_input_text(const simd8x64 in) { static char *buf = (char*)malloc(sizeof(simd8x64) + 1); @@ -10783,6 +11385,34 @@ UNUSED static char * format_mask(uint64_t mask) { buf[64] = '\0'; return buf; } + +template +really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + +template +really_inline size_t buf_block_reader::block_index() { return idx; } + +template +really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +really_inline const uint8_t *buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { + memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. + memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} /* end file src/generic/stage1/buf_block_reader.h */ /* begin file src/generic/stage1/json_string_scanner.h */ namespace stage1 { @@ -11082,13 +11712,15 @@ template error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept { buf_block_reader reader(buf, len); json_minifier minifier(dst); + + // Index the first n-1 blocks while (reader.has_full_block()) { minifier.step(reader.full_block(), reader); } - if (likely(reader.has_remainder())) { - uint8_t block[STEP_SIZE]; - reader.get_remainder(block); + // Index the last (remainder) block, padded with spaces + uint8_t block[STEP_SIZE]; + if (likely(reader.get_remainder(block)) > 0) { minifier.step(block, reader); } @@ -11101,6 +11733,94 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); } +/* begin file src/generic/stage1/find_next_document_index.h */ +/** + * This algorithm is used to quickly identify the last structural position that + * makes up a complete document. + * + * It does this by going backwards and finding the last *document boundary* (a + * place where one value follows another without a comma between them). If the + * last document (the characters after the boundary) has an equal number of + * start and end brackets, it is considered complete. + * + * Simply put, we iterate over the structural characters, starting from + * the end. We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ';' ',' + * and when the second element is NOT one of these characters: '}' '}' ';' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and that means we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete document, therefore the last json buffer location is the end of the + * batch. + */ +really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) { + // TODO don't count separately, just figure out depth + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { + auto idxb = parser.structural_indexes[i]; + switch (parser.buf[idxb]) { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (parser.buf[idxa]) { + case '{': + case '[': + case ':': + case ',': + continue; + } + // Last document is complete, so the next document will appear after! + if (!arr_cnt && !obj_cnt) { + return parser.n_structural_indexes; + } + // Last document is incomplete; mark the document at i + 1 as the next one + return i; + } + return 0; +} + +// Skip the last character if it is partial +really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) { + if (unlikely(len < 3)) { + switch (len) { + case 2: + if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left + return len; + case 1: + if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + return len; + case 0: + return len; + } + } + if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left + if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left + if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left + return len; +} +/* end file src/generic/stage1/find_next_document_index.h */ /* begin file src/generic/stage1/utf8_lookup2_algorithm.h */ // // Detect Unicode errors. @@ -11151,9 +11871,9 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui // support values with more than 23 bits (which a 4-byte character supports). // // e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000) -// +// // Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: -// +// // Code Points 1st 2s 3s 4s // U+0000..U+007F 00..7F // U+0080..U+07FF C2..DF 80..BF @@ -11168,6 +11888,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui using namespace simd; namespace utf8_validation { + // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)". // // Find special case UTF-8 errors where the character is technically readable (has the right length) @@ -11212,7 +11933,7 @@ namespace utf8_validation { const simd8 byte_1_high = prev1.shr<4>().lookup_16( // [0___]____ (ASCII) - 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, // [10__]____ (continuation) 0, 0, 0, 0, @@ -11243,214 +11964,6 @@ namespace utf8_validation { return byte_1_high & byte_1_low & byte_2_high; } - // - // Validate the length of multibyte characters (that each multibyte character has the right number - // of continuation characters, and that all continuation characters are part of a multibyte - // character). - // - // Algorithm - // ========= - // - // This algorithm compares *expected* continuation characters with *actual* continuation bytes, - // and emits an error anytime there is a mismatch. - // - // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte - // characters, the file will look like this: - // - // | Character | 𝄞 | | | | ₿ | | | ֏ | | a | b | - // |-----------------------|----|----|----|----|----|----|----|----|----|----|----| - // | Character Length | 4 | | | | 3 | | | 2 | | 1 | 1 | - // | Byte | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 | - // | is_second_byte | | X | | | | X | | | X | | | - // | is_third_byte | | | X | | | | X | | | | | - // | is_fourth_byte | | | | X | | | | | | | | - // | expected_continuation | | X | X | X | | X | X | | X | | | - // | is_continuation | | X | X | X | | X | X | | X | | | - // - // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation): - // - // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not - // part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just - // floating around extra outside of any character, or that there is an illegal 5-byte character, - // or maybe it's at the beginning of the file before any characters have started; but it's an - // error in all these cases. - // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means - // we started a new character before we were finished with the current one. - // - // Getting the Previous Bytes - // -------------------------- - // - // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte - // character, we need to "shift the bytes" to find that out. This is what they mean: - // - // - `is_continuation`: if the current byte is a continuation. - // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character. - // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character. - // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character. - // - // We use shuffles to go n bytes back, selecting part of the current `input` and part of the - // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller - // function, because the 1-byte-back data is used by other checks as well. - // - // Getting the Continuation Mask - // ----------------------------- - // - // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as - // numbers, using signed `<` and `>` operations to check if they are continuations or leads. - // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because - // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones). - // - // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads," - // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them. - // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0. - // - // When treated as signed numbers, they look like this: - // - // | Type | High Bits | Binary Range | Signed | - // |--------------|------------|--------------|--------| - // | ASCII | `0` | `01111111` | 127 | - // | | | `00000000` | 0 | - // | 4+-Byte Lead | `1111` | `11111111` | -1 | - // | | | `11110000 | -16 | - // | 3-Byte Lead | `1110` | `11101111` | -17 | - // | | | `11100000 | -32 | - // | 2-Byte Lead | `110` | `11011111` | -33 | - // | | | `11000000 | -64 | - // | Continuation | `10` | `10111111` | -65 | - // | | | `10000000 | -128 | - // - // This makes it pretty easy to get the continuation mask! It's just a single comparison: - // - // ``` - // is_continuation = input < -64` - // ``` - // - // We can do something similar for the others, but it takes two comparisons instead of one: "is - // the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and - // `> -64`. Surely we can do better, they're right next to each other! - // - // Getting the is_xxx Masks: Shifting the Range - // -------------------------------------------- - // - // Notice *why* continuations were a single comparison. The actual *range* would require two - // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get - // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be - // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`. - // - // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps - // ASCII down into the negative, and puts 4+-Byte Lead at the top: - // - // | Type | High Bits | Binary Range | Signed | - // |----------------------|------------|--------------|-------| - // | 4+-Byte Lead (+ 127) | `0111` | `01111111` | 127 | - // | | | `01110000 | 112 | - // |----------------------|------------|--------------|-------| - // | 3-Byte Lead (+ 127) | `0110` | `01101111` | 111 | - // | | | `01100000 | 96 | - // |----------------------|------------|--------------|-------| - // | 2-Byte Lead (+ 127) | `010` | `01011111` | 95 | - // | | | `01000000 | 64 | - // |----------------------|------------|--------------|-------| - // | Continuation (+ 127) | `00` | `00111111` | 63 | - // | | | `00000000 | 0 | - // |----------------------|------------|--------------|-------| - // | ASCII (+ 127) | `1` | `11111111` | -1 | - // | | | `10000000` | -128 | - // |----------------------|------------|--------------|-------| - // - // *Now* we can use signed `>` on all of them: - // - // ``` - // prev1 = input.prev<1> - // prev2 = input.prev<2> - // prev3 = input.prev<3> - // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128` - // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128` - // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128` - // is_second_byte = prev1_flipped > 63; // 2+-byte lead - // is_third_byte = prev2_flipped > 95; // 3+-byte lead - // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead - // ``` - // - // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number - // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3 - // `^`'s at a time on Haswell, but only 2 `+`'s). - // - // That doesn't look like it saved us any instructions, did it? Well, because we're adding the - // same number to all of them, we can save one of those `+ 128` operations by assembling - // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128 - // to it. One more instruction saved! - // - // ``` - // prev1 = input.prev<1> - // prev3 = input.prev<3> - // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128` - // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128` - // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped): // | C -> ^ D, or - // you do B | C -> | A -> ^ D. But addition and subtraction *are* associative: (A + B + C) - D can - // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and - // then adds the result together. Same number of operations, but if the processor can run - // independent things in parallel (which most can), it runs faster. - // - // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have - // a super nice advantage in that more of them can be run at the same time (they can run on 3 - // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C, - // saving us the cycle we would have earned by using +. Even more, using an instruction with a - // wider array of ports can help *other* code run ahead, too, since these instructions can "get - // out of the way," running on a port other instructions can't. - // - // Epilogue II: One More Trick - // --------------------------- - // - // There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay - // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in - // check_special_cases()--but we'll talk about that there :) - // really_inline simd8 check_multibyte_lengths(simd8 input, simd8 prev_input, simd8 prev1) { simd8 prev2 = input.prev<2>(prev_input); simd8 prev3 = input.prev<3>(prev_input); @@ -11588,16 +12101,22 @@ class bit_indexer { class json_structural_indexer { public: + /** + * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. + * + * @param partial Setting the partial parameter to true allows the find_structural_bits to + * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If + * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. + */ template - static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept; + static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept; private: - really_inline json_structural_indexer(uint32_t *structural_indexes) - : indexer{structural_indexes} {} + really_inline json_structural_indexer(uint32_t *structural_indexes); template really_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; really_inline void next(simd::simd8x64 in, json_block block, size_t idx); - really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming); + really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial); json_scanner scanner{}; utf8_checker checker{}; @@ -11606,42 +12125,44 @@ class json_structural_indexer { uint64_t unescaped_chars_error = 0; }; -really_inline void json_structural_indexer::next(simd::simd8x64 in, json_block block, size_t idx) { - uint64_t unescaped = in.lteq(0x1F); - checker.check_next_input(in); - indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser - prev_structurals = block.structural_start(); - unescaped_chars_error |= block.non_quote_inside_string(unescaped); -} +really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} -really_inline error_code json_structural_indexer::finish(parser &parser, size_t idx, size_t len, bool streaming) { - // Write out the final iteration's structurals - indexer.write(uint32_t(idx-64), prev_structurals); +// +// PERF NOTES: +// We pipe 2 inputs through these stages: +// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load +// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. +// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. +// The output of step 1 depends entirely on this information. These functions don't quite use +// up enough CPU: the second half of the functions is highly serial, only using 1 execution core +// at a time. The second input's scans has some dependency on the first ones finishing it, but +// they can make a lot of progress before they need that information. +// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that +// to finish: utf-8 checks and generating the output from the last iteration. +// +// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all +// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough +// workout. +// +template +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept { + if (unlikely(len > parser.capacity())) { return CAPACITY; } + if (partial) { len = trim_partial_utf8(buf, len); } - error_code error = scanner.finish(streaming); - if (unlikely(error != SUCCESS)) { return error; } + buf_block_reader reader(buf, len); + json_structural_indexer indexer(parser.structural_indexes.get()); - if (unescaped_chars_error) { - return UNESCAPED_CHARS; + // Read all but the last block + while (reader.has_full_block()) { + indexer.step(reader.full_block(), reader); } - parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); - /* a valid JSON file cannot have zero structural indexes - we should have - * found something */ - if (unlikely(parser.n_structural_indexes == 0u)) { - return EMPTY; - } - if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { - return UNEXPECTED_ERROR; - } - if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) { - /* the string might not be NULL terminated, but we add a virtual NULL - * ending character. */ - parser.structural_indexes[parser.n_structural_indexes++] = uint32_t(len); - } - /* make it safe to dereference one beyond this array */ - parser.structural_indexes[parser.n_structural_indexes] = 0; - return checker.errors(); + // Take care of the last block (will always be there unless file is empty) + uint8_t block[STEP_SIZE]; + if (unlikely(reader.get_remainder(block) == 0)) { return EMPTY; } + indexer.step(block, reader); + + return indexer.finish(parser, reader.block_index(), len, partial); } template<> @@ -11663,60 +12184,76 @@ really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_b reader.advance(); } -// -// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. -// -// PERF NOTES: -// We pipe 2 inputs through these stages: -// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load -// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. -// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. -// The output of step 1 depends entirely on this information. These functions don't quite use -// up enough CPU: the second half of the functions is highly serial, only using 1 execution core -// at a time. The second input's scans has some dependency on the first ones finishing it, but -// they can make a lot of progress before they need that information. -// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that -// to finish: utf-8 checks and generating the output from the last iteration. -// -// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all -// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough -// workout. -// -// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings. -// The caller should still ensure that the input is valid UTF-8. If you are processing substrings, -// you may want to call on a function like trimmed_length_safe_utf8. -template -error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept { - if (unlikely(len > parser.capacity())) { return CAPACITY; } +really_inline void json_structural_indexer::next(simd::simd8x64 in, json_block block, size_t idx) { + uint64_t unescaped = in.lteq(0x1F); + checker.check_next_input(in); + indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser + prev_structurals = block.structural_start(); + unescaped_chars_error |= block.non_quote_inside_string(unescaped); +} + +really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) { + // Write out the final iteration's structurals + indexer.write(uint32_t(idx-64), prev_structurals); + + error_code error = scanner.finish(partial); + if (unlikely(error != SUCCESS)) { return error; } - buf_block_reader reader(buf, len); - json_structural_indexer indexer(parser.structural_indexes.get()); - while (reader.has_full_block()) { - indexer.step(reader.full_block(), reader); + if (unescaped_chars_error) { + return UNESCAPED_CHARS; } - if (likely(reader.has_remainder())) { - uint8_t block[STEP_SIZE]; - reader.get_remainder(block); - indexer.step(block, reader); + parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); + /*** + * This is related to https://github.com/simdjson/simdjson/issues/906 + * Basically, we want to make sure that if the parsing continues beyond the last (valid) + * structural character, it quickly stops. + * Only three structural characters can be repeated without triggering an error in JSON: [,] and }. + * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing + * continues, then it must be [,] or }. + * Suppose it is ] or }. We backtrack to the first character, what could it be that would + * not trigger an error? It could be ] or } but no, because you can't start a document that way. + * It can't be a comma, a colon or any simple value. So the only way we could continue is + * if the repeated character is [. But if so, the document must start with [. But if the document + * starts with [, it should end with ]. If we enforce that rule, then we would get + * ][[ which is invalid. + **/ + parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); + parser.structural_indexes[parser.n_structural_indexes + 2] = 0; + parser.next_structural_index = 0; + // a valid JSON file cannot have zero structural indexes - we should have found something + if (unlikely(parser.n_structural_indexes == 0u)) { + return EMPTY; } - - return indexer.finish(parser, reader.block_index(), len, streaming); + if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { + return UNEXPECTED_ERROR; + } + if (partial) { + auto new_structural_indexes = find_next_document_index(parser); + if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { + return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. + } + parser.n_structural_indexes = new_structural_indexes; + } + return checker.errors(); } } // namespace stage1 /* end file src/generic/stage1/json_structural_indexer.h */ -WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept { - return westmere::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming); +WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept { + this->buf = _buf; + this->len = _len; + return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming); } } // namespace westmere - } // namespace simdjson UNTARGET_REGION -/* end file src/generic/stage1/json_structural_indexer.h */ -/* begin file src/westmere/stage2.cpp */ -/* westmere/implementation.h already included: #include "westmere/implementation.h" */ + +// +// Stage 2 +// /* begin file src/westmere/stringparsing.h */ #ifndef SIMDJSON_WESTMERE_STRINGPARSING_H #define SIMDJSON_WESTMERE_STRINGPARSING_H @@ -12130,10 +12667,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) { // If you consume a large value and you map it to "infinity", you will no // longer be able to serialize back a standard-compliant JSON. And there is // no realistic application where you might need values so large than they - // can't fit in binary64. The maximal value is about 1.7976931348623157 × + // can't fit in binary64. The maximal value is about 1.7976931348623157 x // 10^308 It is an unimaginable large number. There will never be any piece of // engineering involving as many as 10^308 parts. It is estimated that there - // are about 10^80 atoms in the universe.  The estimate for the total number + // are about 10^80 atoms in the universe. The estimate for the total number // of electrons is similar. Using a double-precision floating-point value, we // can represent easily the number of atoms in the universe. We could also // represent the number of ways you can pick any three individual atoms at @@ -12153,26 +12690,6 @@ really_inline bool is_integer(char c) { // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers } -// We need to check that the character following a zero is valid. This is -// probably frequent and it is harder than it looks. We are building all of this -// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)... -const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, - 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - -really_inline bool -is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) { - return structural_or_whitespace_or_exponent_or_decimal_negated[c]; -} // check quickly whether the next 8 chars are made of digits // at a glance, it looks better than Mula's @@ -12250,14 +12767,14 @@ never_inline bool parse_large_integer(const uint8_t *const src, // as a positive signed integer, but the negative version is // possible. constexpr int64_t signed_answer = INT64_MIN; - writer.write_s64(signed_answer); + writer.append_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif } else { // we can negate safely int64_t signed_answer = -static_cast(i); - writer.write_s64(signed_answer); + writer.append_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif @@ -12270,12 +12787,12 @@ never_inline bool parse_large_integer(const uint8_t *const src, #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif - writer.write_s64(i); + writer.append_s64(i); } else { #ifdef JSON_TEST_NUMBERS // for unit testing found_unsigned_integer(i, src); #endif - writer.write_u64(i); + writer.append_u64(i); } } return is_structural_or_whitespace(*p); @@ -12285,7 +12802,7 @@ template bool slow_float_parsing(UNUSED const char * src, W writer) { double d; if (parse_float_strtod(src, &d)) { - writer.write_double(d); + writer.append_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, (const uint8_t *)src); #endif @@ -12309,10 +12826,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) { template really_inline bool parse_number(UNUSED const uint8_t *const src, UNUSED bool found_minus, - W writer) { + W &writer) { #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes // useful to skip parsing - writer.write_s64(0); // always write zero + writer.append_s64(0); // always write zero return true; // always succeeds #else const char *p = reinterpret_cast(src); @@ -12332,7 +12849,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, uint64_t i; // an unsigned int avoids signed overflows (which are bad) if (*p == '0') { // 0 cannot be followed by an integer ++p; - if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) { + if (is_integer(*p)) { #ifdef JSON_TEST_NUMBERS // for unit testing found_invalid_number(src); #endif @@ -12456,7 +12973,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, } // we over-decrement by one when there is a '.' digit_count -= int(start - start_digits); - if (digit_count >= 19) { + if (unlikely(digit_count >= 19)) { // Ok, chances are good that we had an overflow! // this is almost never going to get called!!! // we start anew, going slowly!!! @@ -12464,14 +12981,22 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, // 10000000000000000000000000000000000000000000e+308 // 3.1415926535897932384626433832795028841971693993751 // - return slow_float_parsing((const char *) src, writer); + bool success = slow_float_parsing((const char *) src, writer); + // The number was already written, but we made a copy of the writer + // when we passed it to the parse_large_integer() function, so + writer.skip_double(); + return success; } } if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!! // this is almost never going to get called!!! // we start anew, going slowly!!! - return slow_float_parsing((const char *) src, writer); + bool success = slow_float_parsing((const char *) src, writer); + // The number was already written, but we made a copy of the writer when we passed it to the + // slow_float_parsing() function, so we have to skip those tape spots now that we've returned + writer.skip_double(); + return success; } bool success = true; double d = compute_float_64(exponent, i, negative, &success); @@ -12480,7 +13005,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, success = parse_float_strtod((const char *)src, &d); } if (success) { - writer.write_double(d); + writer.append_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, src); #endif @@ -12495,10 +13020,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, if (unlikely(digit_count >= 18)) { // this is uncommon!!! // there is a good chance that we had an overflow, so we need // need to recover: we parse the whole thing again. - return parse_large_integer(src, writer, found_minus); + bool success = parse_large_integer(src, writer, found_minus); + // The number was already written, but we made a copy of the writer + // when we passed it to the parse_large_integer() function, so + writer.skip_large_integer(); + return success; } i = negative ? 0 - i : i; - writer.write_s64(i); + writer.append_s64(i); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif @@ -12523,6 +13052,72 @@ TARGET_WESTMERE namespace simdjson { namespace westmere { +/* begin file src/generic/stage2/logger.h */ +// This is for an internal-only stage 2 specific logger. +// Set LOG_ENABLED = true to log what stage 2 is doing! +namespace logger { + static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; + + static constexpr const bool LOG_ENABLED = false; + static constexpr const int LOG_EVENT_LEN = 30; + static constexpr const int LOG_BUFFER_LEN = 20; + static constexpr const int LOG_DETAIL_LEN = 50; + static constexpr const int LOG_INDEX_LEN = 10; + + static int log_depth; // Not threadsafe. Log only. + + // Helper to turn unprintable or newline characters into spaces + static really_inline char printable_char(char c) { + if (c >= 0x20) { + return c; + } else { + return ' '; + } + } + + // Print the header and set up log_start + static really_inline void log_start() { + if (LOG_ENABLED) { + log_depth = 0; + printf("\n"); + printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index"); + printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES); + } + } + + static really_inline void log_string(const char *message) { + if (LOG_ENABLED) { + printf("%s\n", message); + } + } + + // Logs a single line of + template + static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { + if (LOG_ENABLED) { + printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); + { + // Print the next N characters in the buffer. + printf("| "); + // Otherwise, print the characters starting from the buffer position. + // Print spaces for unprintable or newline characters. + for (int i=0;i really_inline bool with_space_terminated_copy(const F& f) { @@ -12617,32 +13220,25 @@ class structural_iterator { * practice unless you are in the strange scenario where you have many JSON * documents made of single atoms. */ - char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); + char *copy = static_cast(malloc(parser.len + SIMDJSON_PADDING)); if (copy == nullptr) { return true; } - memcpy(copy, buf, len); - memset(copy + len, ' ', SIMDJSON_PADDING); - bool result = f(reinterpret_cast(copy), idx); + memcpy(copy, buf, parser.len); + memset(copy + parser.len, ' ', SIMDJSON_PADDING); + bool result = f(reinterpret_cast(copy), *current_structural); free(copy); return result; } really_inline bool past_end(uint32_t n_structural_indexes) { - return next_structural+1 > n_structural_indexes; + return current_structural >= &parser.structural_indexes[n_structural_indexes]; } really_inline bool at_end(uint32_t n_structural_indexes) { - return next_structural+1 == n_structural_indexes; + return current_structural == &parser.structural_indexes[n_structural_indexes]; } - really_inline size_t next_structural_index() { - return next_structural; + really_inline bool at_beginning() { + return current_structural == parser.structural_indexes.get(); } - - const uint8_t* const buf; - const size_t len; - const uint32_t* const structural_indexes; - size_t next_structural; // next structural index - size_t idx{0}; // location of the structural character in the input (buf) - uint8_t c{0}; // used to track the (structural) character we are looking at }; } // namespace stage2 @@ -12654,8 +13250,105 @@ class structural_iterator { // "simdjson/stage2.h" (this simplifies amalgation) namespace stage2 { +namespace { // Make everything here private + +/* begin file src/generic/stage2/tape_writer.h */ +struct tape_writer { + /** The next place to write to tape */ + uint64_t *next_tape_loc; + + /** Write a signed 64-bit value to tape. */ + really_inline void append_s64(int64_t value) noexcept; + + /** Write an unsigned 64-bit value to tape. */ + really_inline void append_u64(uint64_t value) noexcept; + + /** Write a double value to tape. */ + really_inline void append_double(double value) noexcept; + + /** + * Append a tape entry (an 8-bit type,and 56 bits worth of value). + */ + really_inline void append(uint64_t val, internal::tape_type t) noexcept; + + /** + * Skip the current tape entry without writing. + * + * Used to skip the start of the container, since we'll come back later to fill it in when the + * container ends. + */ + really_inline void skip() noexcept; + + /** + * Skip the number of tape entries necessary to write a large u64 or i64. + */ + really_inline void skip_large_integer() noexcept; + + /** + * Skip the number of tape entries necessary to write a double. + */ + really_inline void skip_double() noexcept; + + /** + * Write a value to a known location on tape. + * + * Used to go back and write out the start of a container after the container ends. + */ + really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; + +private: + /** + * Append both the tape entry, and a supplementary value following it. Used for types that need + * all 64 bits, such as double and uint64_t. + */ + template + really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; +}; // struct number_writer + +really_inline void tape_writer::append_s64(int64_t value) noexcept { + append2(0, value, internal::tape_type::INT64); +} + +really_inline void tape_writer::append_u64(uint64_t value) noexcept { + append(0, internal::tape_type::UINT64); + *next_tape_loc = value; + next_tape_loc++; +} -using internal::ret_address; +/** Write a double value to tape. */ +really_inline void tape_writer::append_double(double value) noexcept { + append2(0, value, internal::tape_type::DOUBLE); +} + +really_inline void tape_writer::skip() noexcept { + next_tape_loc++; +} + +really_inline void tape_writer::skip_large_integer() noexcept { + next_tape_loc += 2; +} + +really_inline void tape_writer::skip_double() noexcept { + next_tape_loc += 2; +} + +really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept { + *next_tape_loc = val | ((uint64_t(char(t))) << 56); + next_tape_loc++; +} + +template +really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept { + append(val, t); + static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); + memcpy(next_tape_loc, &val2, sizeof(val2)); + next_tape_loc++; +} + +really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept { + tape_loc = val | ((uint64_t(char(t))) << 56); +} +/* end file src/generic/stage2/tape_writer.h */ #ifdef SIMDJSON_USE_COMPUTED_GOTO #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue } @@ -12686,102 +13379,88 @@ using internal::ret_address; #endif // SIMDJSON_USE_COMPUTED_GOTO struct unified_machine_addresses { - ret_address array_begin; - ret_address array_continue; - ret_address error; - ret_address finish; - ret_address object_begin; - ret_address object_continue; + ret_address_t array_begin; + ret_address_t array_continue; + ret_address_t error; + ret_address_t finish; + ret_address_t object_begin; + ret_address_t object_continue; }; #undef FAIL_IF #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } } -struct number_writer { - parser &doc_parser; - - really_inline void write_s64(int64_t value) noexcept { - write_tape(0, internal::tape_type::INT64); - std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value)); - ++doc_parser.current_loc; - } - really_inline void write_u64(uint64_t value) noexcept { - write_tape(0, internal::tape_type::UINT64); - doc_parser.doc.tape[doc_parser.current_loc++] = value; - } - really_inline void write_double(double value) noexcept { - write_tape(0, internal::tape_type::DOUBLE); - static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size"); - memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double)); - // doc.tape[doc.current_loc++] = *((uint64_t *)&d); - } - really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { - doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); - } -}; // struct number_writer - -struct structural_parser { - structural_iterator structurals; - parser &doc_parser; +struct structural_parser : structural_iterator { + /** Lets you append to the tape */ + tape_writer tape; /** Next write location in the string buf for stage 2 parsing */ - uint8_t *current_string_buf_loc{}; - uint32_t depth; - - really_inline structural_parser( - const uint8_t *buf, - size_t len, - parser &_doc_parser, - uint32_t next_structural = 0 - ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {} - - WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) { - doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc; - doc_parser.containing_scope[depth].count = 0; - write_tape(0, type); // if the document is correct, this gets rewritten later - doc_parser.ret_address[depth] = continue_state; + uint8_t *current_string_buf_loc; + /** Current depth (nested objects and arrays) */ + uint32_t depth{0}; + + // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations + really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index) + : structural_iterator(_parser, start_structural_index), + tape{parser.doc->tape.get()}, + current_string_buf_loc{parser.doc->string_buf.get()} { + } + + WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) { + parser.containing_scope[depth].tape_index = next_tape_index(); + parser.containing_scope[depth].count = 0; + tape.skip(); // We don't actually *write* the start element until the end. + parser.ret_address[depth] = continue_state; depth++; - return depth >= doc_parser.max_depth(); + bool exceeded_max_depth = depth >= parser.max_depth(); + if (exceeded_max_depth) { log_error("Exceeded max depth!"); } + return exceeded_max_depth; } - WARN_UNUSED really_inline bool start_document(ret_address continue_state) { - return start_scope(internal::tape_type::ROOT, continue_state); + WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) { + log_start_value("document"); + return start_scope(continue_state); } - WARN_UNUSED really_inline bool start_object(ret_address continue_state) { - return start_scope(internal::tape_type::START_OBJECT, continue_state); + WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) { + log_start_value("object"); + return start_scope(continue_state); } - WARN_UNUSED really_inline bool start_array(ret_address continue_state) { - return start_scope(internal::tape_type::START_ARRAY, continue_state); + WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) { + log_start_value("array"); + return start_scope(continue_state); } // this function is responsible for annotating the start of the scope - really_inline void end_scope(internal::tape_type type) noexcept { + really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept { depth--; - // write our doc.tape location to the header scope + // write our doc->tape location to the header scope // The root scope gets written *at* the previous location. - write_tape(doc_parser.containing_scope[depth].tape_index, type); + tape.append(parser.containing_scope[depth].tape_index, end); // count can overflow if it exceeds 24 bits... so we saturate // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). - const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index; - const uint32_t count = doc_parser.containing_scope[depth].count; + const uint32_t start_tape_index = parser.containing_scope[depth].tape_index; + const uint32_t count = parser.containing_scope[depth].count; const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; - // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index] - doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32); + // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index] + tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start); + } + + really_inline uint32_t next_tape_index() { + return uint32_t(tape.next_tape_loc - parser.doc->tape.get()); } really_inline void end_object() { - end_scope(internal::tape_type::END_OBJECT); + log_end_value("object"); + end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); } really_inline void end_array() { - end_scope(internal::tape_type::END_ARRAY); + log_end_value("array"); + end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); } really_inline void end_document() { - end_scope(internal::tape_type::ROOT); - } - - really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { - doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); + log_end_value("document"); + end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT); } // increment_count increments the count of keys in an object or values in an array. @@ -12789,17 +13468,16 @@ struct structural_parser { // must be increment in the preceding depth (depth-1) where the array or // the object resides. really_inline void increment_count() { - doc_parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 + parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 } really_inline uint8_t *on_start_string() noexcept { - /* we advance the point, accounting for the fact that we have a NULL - * termination */ - write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING); + // we advance the point, accounting for the fact that we have a NULL termination + tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING); return current_string_buf_loc + sizeof(uint32_t); } - really_inline bool on_end_string(uint8_t *dst) noexcept { + really_inline void on_end_string(uint8_t *dst) noexcept { uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); // TODO check for overflow in case someone has a crazy string (>=4GB?) // But only add the overflow check when the document itself exceeds 4GB @@ -12809,73 +13487,49 @@ struct structural_parser { // be NULL terminated? It comes at a small cost *dst = 0; current_string_buf_loc = dst + 1; - return true; } - WARN_UNUSED really_inline bool parse_string() { + WARN_UNUSED really_inline bool parse_string(bool key = false) { + log_value(key ? "key" : "string"); uint8_t *dst = on_start_string(); - dst = stringparsing::parse_string(structurals.current(), dst); + dst = stringparsing::parse_string(current(), dst); if (dst == nullptr) { + log_error("Invalid escape in string"); return true; } - return !on_end_string(dst); + on_end_string(dst); + return false; } WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) { - number_writer writer{doc_parser}; - return !numberparsing::parse_number(src, found_minus, writer); + log_value("number"); + bool succeeded = numberparsing::parse_number(src, found_minus, tape); + if (!succeeded) { log_error("Invalid number"); } + return !succeeded; } WARN_UNUSED really_inline bool parse_number(bool found_minus) { - return parse_number(structurals.current(), found_minus); - } - - WARN_UNUSED really_inline bool parse_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; } - write_tape(0, internal::tape_type::TRUE_VALUE); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; } - write_tape(0, internal::tape_type::FALSE_VALUE); - break; - case 'n': - if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; } - write_tape(0, internal::tape_type::NULL_VALUE); - break; - default: - return true; - } - return false; - } - - WARN_UNUSED really_inline bool parse_single_atom() { - switch (structurals.current_char()) { - case 't': - if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; } - write_tape(0, internal::tape_type::TRUE_VALUE); - break; - case 'f': - if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; } - write_tape(0, internal::tape_type::FALSE_VALUE); - break; - case 'n': - if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; } - write_tape(0, internal::tape_type::NULL_VALUE); - break; - default: - return true; - } - return false; + return parse_number(current(), found_minus); } - WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) { - switch (structurals.current_char()) { + WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) { + switch (advance_char()) { case '"': FAIL_IF( parse_string() ); return continue_state; - case 't': case 'f': case 'n': - FAIL_IF( parse_atom() ); + case 't': + log_value("true"); + FAIL_IF( !atomparsing::is_valid_true_atom(current()) ); + tape.append(0, internal::tape_type::TRUE_VALUE); + return continue_state; + case 'f': + log_value("false"); + FAIL_IF( !atomparsing::is_valid_false_atom(current()) ); + tape.append(0, internal::tape_type::FALSE_VALUE); + return continue_state; + case 'n': + log_value("null"); + FAIL_IF( !atomparsing::is_valid_null_atom(current()) ); + tape.append(0, internal::tape_type::NULL_VALUE); return continue_state; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': @@ -12891,40 +13545,27 @@ struct structural_parser { FAIL_IF( start_array(continue_state) ); return addresses.array_begin; default: + log_error("Non-value found when value was expected!"); return addresses.error; } } WARN_UNUSED really_inline error_code finish() { - // the string might not be NULL terminated. - if ( !structurals.at_end(doc_parser.n_structural_indexes) ) { - return on_error(TAPE_ERROR); - } end_document(); + parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]); + if (depth != 0) { - return on_error(TAPE_ERROR); - } - if (doc_parser.containing_scope[depth].tape_index != 0) { - return on_error(TAPE_ERROR); + log_error("Unclosed objects or arrays!"); + return parser.error = TAPE_ERROR; } - return on_success(SUCCESS); - } - - really_inline error_code on_error(error_code new_error_code) noexcept { - doc_parser.error = new_error_code; - return new_error_code; - } - really_inline error_code on_success(error_code success_code) noexcept { - doc_parser.error = success_code; - doc_parser.valid = true; - return success_code; + return SUCCESS; } WARN_UNUSED really_inline error_code error() { - /* We do not need the next line because this is done by doc_parser.init_stage2(), + /* We do not need the next line because this is done by parser.init_stage2(), * pessimistically. - * doc_parser.is_valid = false; + * parser.is_valid = false; * At this point in the code, we have all the time in the world. * Note that we know exactly where we are in the document so we could, * without any overhead on the processing code, report a specific @@ -12932,12 +13573,12 @@ struct structural_parser { * We could even trigger special code paths to assess what happened * carefully, * all without any added cost. */ - if (depth >= doc_parser.max_depth()) { - return on_error(DEPTH_ERROR); + if (depth >= parser.max_depth()) { + return parser.error = DEPTH_ERROR; } - switch (structurals.current_char()) { + switch (current_char()) { case '"': - return on_error(STRING_ERROR); + return parser.error = STRING_ERROR; case '0': case '1': case '2': @@ -12949,92 +13590,124 @@ struct structural_parser { case '8': case '9': case '-': - return on_error(NUMBER_ERROR); + return parser.error = NUMBER_ERROR; case 't': - return on_error(T_ATOM_ERROR); + return parser.error = T_ATOM_ERROR; case 'n': - return on_error(N_ATOM_ERROR); + return parser.error = N_ATOM_ERROR; case 'f': - return on_error(F_ATOM_ERROR); + return parser.error = F_ATOM_ERROR; default: - return on_error(TAPE_ERROR); + return parser.error = TAPE_ERROR; } } really_inline void init() { - current_string_buf_loc = doc_parser.doc.string_buf.get(); - doc_parser.current_loc = 0; - doc_parser.valid = false; - doc_parser.error = UNINITIALIZED; + log_start(); + parser.error = UNINITIALIZED; } - WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) { - init(); // sets is_valid to false - if (len > doc_parser.capacity()) { - return CAPACITY; + WARN_UNUSED really_inline error_code start(ret_address_t finish_state) { + // If there are no structurals left, return EMPTY + if (at_end(parser.n_structural_indexes)) { + return parser.error = EMPTY; } - // Advance to the first character as soon as possible - structurals.advance_char(); + + init(); // Push the root scope (there is always at least one scope) if (start_document(finish_state)) { - return on_error(DEPTH_ERROR); + return parser.error = DEPTH_ERROR; } return SUCCESS; } - really_inline char advance_char() { - return structurals.advance_char(); + really_inline void log_value(const char *type) { + logger::log_line(*this, "", type, ""); } -}; + + static really_inline void log_start() { + logger::log_start(); + } + + really_inline void log_start_value(const char *type) { + logger::log_line(*this, "+", type, ""); + if (logger::LOG_ENABLED) { logger::log_depth++; } + } + + really_inline void log_end_value(const char *type) { + if (logger::LOG_ENABLED) { logger::log_depth--; } + logger::log_line(*this, "-", type, ""); + } + + really_inline void log_error(const char *error) { + logger::log_line(*this, "", "ERROR", error); + } +}; // struct structural_parser // Redefine FAIL_IF to use goto since it'll be used inside the function now #undef FAIL_IF #define FAIL_IF(EXPR) { if (EXPR) { goto error; } } -} // namespace stage2 - -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. - ***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { +template +WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept { + dom_parser.doc = &doc; static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::structural_parser parser(buf, len, doc_parser); - error_code result = parser.start(len, addresses.finish); + stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0); + error_code result = parser.start(addresses.finish); if (result) { return result; } // // Read first value // - switch (parser.structurals.current_char()) { + switch (parser.current_char()) { case '{': FAIL_IF( parser.start_object(addresses.finish) ); goto object_begin; case '[': FAIL_IF( parser.start_array(addresses.finish) ); + // Make sure the outer array is closed before continuing; otherwise, there are ways we could get + // into memory corruption. See https://github.com/simdjson/simdjson/issues/906 + if (!STREAMING) { + if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') { + goto error; + } + } goto array_begin; case '"': FAIL_IF( parser.parse_string() ); goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); + case 't': + parser.log_value("true"); + FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) ); + parser.tape.append(0, internal::tape_type::TRUE_VALUE); + goto finish; + case 'f': + parser.log_value("false"); + FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) ); + parser.tape.append(0, internal::tape_type::FALSE_VALUE); + goto finish; + case 'n': + parser.log_value("null"); + FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) ); + parser.tape.append(0, internal::tape_type::NULL_VALUE); goto finish; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], false); }) ); goto finish; case '-': FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], true); }) ); goto finish; default: + parser.log_error("Document starts with a non-value character"); goto error; } @@ -13045,43 +13718,45 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa switch (parser.advance_char()) { case '"': { parser.increment_count(); - FAIL_IF( parser.parse_string() ); + FAIL_IF( parser.parse_string(true) ); goto object_key_state; } case '}': parser.end_object(); goto scope_end; default: + parser.log_error("Object does not start with a key"); goto error; } object_key_state: - FAIL_IF( parser.advance_char() != ':' ); - parser.advance_char(); + if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; } GOTO( parser.parse_value(addresses, addresses.object_continue) ); object_continue: switch (parser.advance_char()) { case ',': parser.increment_count(); - FAIL_IF( parser.advance_char() != '"' ); - FAIL_IF( parser.parse_string() ); + if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; } + FAIL_IF( parser.parse_string(true) ); goto object_key_state; case '}': parser.end_object(); goto scope_end; default: + parser.log_error("No comma between object fields"); goto error; } scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); + CONTINUE( parser.parser.ret_address[parser.depth] ); // // Array parser states // array_begin: - if (parser.advance_char() == ']') { + if (parser.peek_next_char() == ']') { + parser.advance_char(); parser.end_array(); goto scope_end; } @@ -13096,12 +13771,12 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa switch (parser.advance_char()) { case ',': parser.increment_count(); - parser.advance_char(); goto main_array_switch; case ']': parser.end_array(); goto scope_end; default: + parser.log_error("Missing comma between array values"); goto error; } @@ -13112,176 +13787,46 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa return parser.error(); } -WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { - error_code code = stage1(buf, len, doc_parser, false); - if (!code) { - code = stage2(buf, len, doc_parser); - } - return code; -} -/* end file src/generic/stage2/structural_parser.h */ -/* begin file src/generic/stage2/streaming_structural_parser.h */ -namespace stage2 { - -struct streaming_structural_parser: structural_parser { - really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {} - - // override to add streaming - WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) { - init(); // sets is_valid to false - // Capacity ain't no thang for streaming, so we don't check it. - // Advance to the first character as soon as possible - advance_char(); - // Push the root scope (there is always at least one scope) - if (start_document(finish_parser)) { - return on_error(DEPTH_ERROR); - } - return SUCCESS; - } - - // override to add streaming - WARN_UNUSED really_inline error_code finish() { - if ( structurals.past_end(doc_parser.n_structural_indexes) ) { - return on_error(TAPE_ERROR); - } - end_document(); - if (depth != 0) { - return on_error(TAPE_ERROR); - } - if (doc_parser.containing_scope[depth].tape_index != 0) { - return on_error(TAPE_ERROR); - } - bool finished = structurals.at_end(doc_parser.n_structural_indexes); - return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE); - } -}; - +} // namespace {} } // namespace stage2 /************ * The JSON is parsed to a tape, see the accompanying tape.md file * for documentation. ***********/ -WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept { - static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json)); - error_code result = parser.start(len, addresses.finish); +WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { + error_code result = stage2::parse_structurals(*this, _doc); if (result) { return result; } - // - // Read first value - // - switch (parser.structurals.current_char()) { - case '{': - FAIL_IF( parser.start_object(addresses.finish) ); - goto object_begin; - case '[': - FAIL_IF( parser.start_array(addresses.finish) ); - goto array_begin; - case '"': - FAIL_IF( parser.parse_string() ); - goto finish; - case 't': case 'f': case 'n': - FAIL_IF( parser.parse_single_atom() ); - goto finish; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { - return parser.parse_number(©[idx], false); - }) - ); - goto finish; - case '-': - FAIL_IF( - parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { - return parser.parse_number(©[idx], true); - }) - ); - goto finish; - default: - goto error; - } -// -// Object parser parsers -// -object_begin: - switch (parser.advance_char()) { - case '"': { - FAIL_IF( parser.parse_string() ); - goto object_key_parser; - } - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } - -object_key_parser: - FAIL_IF( parser.advance_char() != ':' ); - parser.increment_count(); - parser.advance_char(); - GOTO( parser.parse_value(addresses, addresses.object_continue) ); - -object_continue: - switch (parser.advance_char()) { - case ',': - FAIL_IF( parser.advance_char() != '"' ); - FAIL_IF( parser.parse_string() ); - goto object_key_parser; - case '}': - parser.end_object(); - goto scope_end; - default: - goto error; - } - -scope_end: - CONTINUE( parser.doc_parser.ret_address[parser.depth] ); - -// -// Array parser parsers -// -array_begin: - if (parser.advance_char() == ']') { - parser.end_array(); - goto scope_end; + // If we didn't make it to the end, it's an error + if ( next_structural_index != n_structural_indexes ) { + logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); + return error = TAPE_ERROR; } - parser.increment_count(); - -main_array_switch: - /* we call update char on all paths in, so we can peek at parser.c on the - * on paths that can accept a close square brace (post-, and at start) */ - GOTO( parser.parse_value(addresses, addresses.array_continue) ); -array_continue: - switch (parser.advance_char()) { - case ',': - parser.increment_count(); - parser.advance_char(); - goto main_array_switch; - case ']': - parser.end_array(); - goto scope_end; - default: - goto error; - } + return SUCCESS; +} -finish: - next_json = parser.structurals.next_structural_index(); - return parser.finish(); +/************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. + ***********/ +WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { + return stage2::parse_structurals(*this, _doc); +} +/* end file src/generic/stage2/tape_writer.h */ -error: - return parser.error(); +WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { + error_code err = stage1(_buf, _len, false); + if (err) { return err; } + return stage2(_doc); } -/* end file src/generic/stage2/streaming_structural_parser.h */ } // namespace westmere } // namespace simdjson UNTARGET_REGION -/* end file src/generic/stage2/streaming_structural_parser.h */ +/* end file src/generic/stage2/tape_writer.h */ #endif SIMDJSON_POP_DISABLE_WARNINGS -/* end file src/generic/stage2/streaming_structural_parser.h */ +/* end file src/generic/stage2/tape_writer.h */ diff --git a/inst/include/simdjson.h b/inst/include/simdjson.h index 0a1d140..21efa8e 100644 --- a/inst/include/simdjson.h +++ b/inst/include/simdjson.h @@ -1,4 +1,4 @@ -/* auto-generated on Wed May 20 10:23:07 EDT 2020. Do not edit! */ +/* auto-generated on Fri 12 Jun 2020 13:09:36 EDT. Do not edit! */ /* begin file include/simdjson.h */ #ifndef SIMDJSON_H #define SIMDJSON_H @@ -2030,7 +2030,6 @@ namespace simdjson { */ enum error_code { SUCCESS = 0, ///< No error - SUCCESS_AND_HAS_MORE, ///< @private No error and buffer still has more data CAPACITY, ///< This parser can't support a document that big MEMALLOC, ///< Error allocating memory, most likely out of memory TAPE_ERROR, ///< Something went wrong while writing to the tape (stage 2), this is a generic error @@ -2409,6 +2408,187 @@ inline char *allocate_padded_buffer(size_t length) noexcept; #ifndef SIMDJSON_IMPLEMENTATION_H #define SIMDJSON_IMPLEMENTATION_H +/* begin file include/simdjson/internal/dom_parser_implementation.h */ +#ifndef SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H +#define SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H + +#include + +namespace simdjson { + +namespace dom { +class document; +} // namespace dom + +namespace internal { + +/** + * An implementation of simdjson's DOM parser for a particular CPU architecture. + * + * This class is expected to be accessed only by pointer, and never move in memory (though the + * pointer can move). + */ +class dom_parser_implementation { +public: + + /** + * @private For internal implementation use + * + * Run a full JSON parse on a single document (stage1 + stage2). + * + * Guaranteed only to be called when capacity > document length. + * + * Overridden by each implementation. + * + * @param buf The json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. + * @param len The length of the json document. + * @return The error code, or SUCCESS if there was no error. + */ + WARN_UNUSED virtual error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept = 0; + + /** + * @private For internal implementation use + * + * Stage 1 of the document parser. + * + * Guaranteed only to be called when capacity > document length. + * + * Overridden by each implementation. + * + * @param buf The json document to parse. + * @param len The length of the json document. + * @param streaming Whether this is being called by parser::parse_many. + * @return The error code, or SUCCESS if there was no error. + */ + WARN_UNUSED virtual error_code stage1(const uint8_t *buf, size_t len, bool streaming) noexcept = 0; + + /** + * @private For internal implementation use + * + * Stage 2 of the document parser. + * + * Called after stage1(). + * + * Overridden by each implementation. + * + * @param doc The document to output to. + * @return The error code, or SUCCESS if there was no error. + */ + WARN_UNUSED virtual error_code stage2(dom::document &doc) noexcept = 0; + + /** + * @private For internal implementation use + * + * Stage 2 of the document parser for parser::parse_many. + * + * Guaranteed only to be called after stage1(). + * Overridden by each implementation. + * + * @param doc The document to output to. + * @return The error code, SUCCESS if there was no error, or EMPTY if all documents have been parsed. + */ + WARN_UNUSED virtual error_code stage2_next(dom::document &doc) noexcept = 0; + + /** + * Change the capacity of this parser. + * + * Generally used for reallocation. + * + * @param capacity The new capacity. + * @param max_depth The new max_depth. + * @return The error code, or SUCCESS if there was no error. + */ + virtual error_code set_capacity(size_t capacity) noexcept = 0; + + /** + * Change the max depth of this parser. + * + * Generally used for reallocation. + * + * @param capacity The new capacity. + * @param max_depth The new max_depth. + * @return The error code, or SUCCESS if there was no error. + */ + virtual error_code set_max_depth(size_t max_depth) noexcept = 0; + + /** + * Deallocate this parser. + */ + virtual ~dom_parser_implementation() = default; + + /** Number of structural indices passed from stage 1 to stage 2 */ + uint32_t n_structural_indexes{0}; + /** Structural indices passed from stage 1 to stage 2 */ + std::unique_ptr structural_indexes{}; + /** Next structural index to parse */ + uint32_t next_structural_index{0}; + + /** + * The largest document this parser can support without reallocating. + * + * @return Current capacity, in bytes. + */ + really_inline size_t capacity() const noexcept; + + /** + * The maximum level of nested object and arrays supported by this parser. + * + * @return Maximum depth, in bytes. + */ + really_inline size_t max_depth() const noexcept; + + /** + * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length + * and `max_depth` depth. + * + * @param capacity The new capacity. + * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. + * @return The error, if there is one. + */ + WARN_UNUSED inline error_code allocate(size_t capacity, size_t max_depth) noexcept; + +protected: + /** + * The maximum document length this parser supports. + * + * Buffers are large enough to handle any document up to this length. + */ + size_t _capacity{0}; + + /** + * The maximum depth (number of nested objects and arrays) supported by this parser. + * + * Defaults to DEFAULT_MAX_DEPTH. + */ + size_t _max_depth{0}; +}; // class dom_parser_implementation + +really_inline size_t dom_parser_implementation::capacity() const noexcept { + return _capacity; +} + +really_inline size_t dom_parser_implementation::max_depth() const noexcept { + return _max_depth; +} + +WARN_UNUSED +inline error_code dom_parser_implementation::allocate(size_t capacity, size_t max_depth) noexcept { + if (this->max_depth() != max_depth) { + error_code err = set_max_depth(max_depth); + if (err) { return err; } + } + if (_capacity != capacity) { + error_code err = set_capacity(capacity); + if (err) { return err; } + } + return SUCCESS; +} + +} // namespace internal +} // namespace simdjson + +#endif // SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H +/* end file include/simdjson/internal/dom_parser_implementation.h */ #include #include #include @@ -2417,8 +2597,8 @@ inline char *allocate_padded_buffer(size_t length) noexcept; namespace simdjson { namespace dom { - class parser; -} + class document; +} // namespace dom /** * An implementation of simdjson for a particular CPU architecture. @@ -2461,16 +2641,19 @@ class implementation { /** * @private For internal implementation use * - * Run a full document parse (ensure_capacity, stage1 and stage2). - * - * Overridden by each implementation. + * const implementation *impl = simdjson::active_implementation; + * cout << "simdjson is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; * - * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. - * @param len the length of the json document. - * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity. - * @return the error code, or SUCCESS if there was no error. + * @param capacity The largest document that will be passed to the parser. + * @param max_depth The maximum JSON object/array nesting this parser is expected to handle. + * @param dst The place to put the resulting parser implementation. + * @return the name of the implementation, e.g. "haswell", "westmere", "arm64" */ - WARN_UNUSED virtual error_code parse(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept = 0; + virtual error_code create_dom_parser_implementation( + size_t capacity, + size_t max_depth, + std::unique_ptr &dst + ) const noexcept = 0; /** * @private For internal implementation use @@ -2487,50 +2670,6 @@ class implementation { */ WARN_UNUSED virtual error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept = 0; - /** - * @private For internal implementation use - * - * Stage 1 of the document parser. - * - * Overridden by each implementation. - * - * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. - * @param len the length of the json document. - * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity. - * @param streaming whether this is being called by parser::parse_many. - * @return the error code, or SUCCESS if there was no error. - */ - WARN_UNUSED virtual error_code stage1(const uint8_t *buf, size_t len, dom::parser &parser, bool streaming) const noexcept = 0; - - /** - * @private For internal implementation use - * - * Stage 2 of the document parser. - * - * Overridden by each implementation. - * - * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. - * @param len the length of the json document. - * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity. - * @return the error code, or SUCCESS if there was no error. - */ - WARN_UNUSED virtual error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept = 0; - - /** - * @private For internal implementation use - * - * Stage 2 of the document parser for parser::parse_many. - * - * Overridden by each implementation. - * - * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. - * @param len the length of the json document. - * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity. - * @param next_json the next structural index. Start this at 0 the first time, and it will be updated to the next value to pass each time. - * @return the error code, SUCCESS if there was no error, or SUCCESS_AND_HAS_MORE if there was no error and stage2 can be called again. - */ - WARN_UNUSED virtual error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser, size_t &next_json) const noexcept = 0; - protected: /** @private Construct an implementation with the given name and description. For subclasses. */ really_inline implementation( @@ -2648,7 +2787,7 @@ extern SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr activ } // namespace simdjson #endif // SIMDJSON_IMPLEMENTATION_H -/* end file include/simdjson/implementation.h */ +/* end file include/simdjson/internal/dom_parser_implementation.h */ /* begin file include/simdjson/dom/array.h */ #ifndef SIMDJSON_DOM_ARRAY_H #define SIMDJSON_DOM_ARRAY_H @@ -3022,22 +3161,6 @@ class document { namespace simdjson { -namespace internal { - -// expectation: sizeof(scope_descriptor) = 64/8. -struct scope_descriptor { - uint32_t tape_index; // where, on the tape, does the scope ([,{) begins - uint32_t count; // how many elements in the scope -}; // struct scope_descriptor - -#ifdef SIMDJSON_USE_COMPUTED_GOTO -typedef void* ret_address; -#else -typedef char ret_address; -#endif - -} // namespace internal - namespace dom { class document_stream; @@ -3075,14 +3198,14 @@ class parser { * * @param other The parser to take. Its capacity is zeroed. */ - parser(parser &&other) = default; + really_inline parser(parser &&other) noexcept; parser(const parser &) = delete; ///< @private Disallow copying /** * Take another parser's buffers and state. * * @param other The parser to take. Its capacity is zeroed. */ - parser &operator=(parser &&other) = default; + really_inline parser &operator=(parser &&other) noexcept; parser &operator=(const parser &) = delete; ///< @private Disallow copying /** Deallocate the JSON parser. */ @@ -3342,7 +3465,8 @@ class parser { /** * Set max_capacity. This is the largest document this parser can automatically support. * - * The parser may reallocate internal buffers as needed up to this amount. + * The parser may reallocate internal buffers as needed up to this amount as documents are passed + * to it. * * This call will not allocate or deallocate, even if capacity is currently above max_capacity. * @@ -3355,19 +3479,8 @@ class parser { /** @private Use simdjson_error instead */ using InvalidJSON [[deprecated("Use simdjson_error instead")]] = simdjson_error; - /** @private Next location to write to in the tape */ - uint32_t current_loc{0}; - - /** @private Number of structural indices passed from stage 1 to stage 2 */ - uint32_t n_structural_indexes{0}; - /** @private Structural indices passed from stage 1 to stage 2 */ - std::unique_ptr structural_indexes{}; - - /** @private Tape location of each open { or [ */ - std::unique_ptr containing_scope{}; - - /** @private Return address of each open { or [ */ - std::unique_ptr ret_address{}; + /** @private [for benchmarking access] The implementation to use */ + std::unique_ptr implementation{}; /** @private Use `if (parser.parse(...).error())` instead */ bool valid{false}; @@ -3407,20 +3520,6 @@ class parser { */ size_t _max_capacity; - /** - * The maximum document length this parser supports. - * - * Buffers are large enough to handle any document up to this length. - */ - size_t _capacity{0}; - - /** - * The maximum depth (number of nested objects and arrays) supported by this parser. - * - * Defaults to DEFAULT_MAX_DEPTH. - */ - size_t _max_depth{0}; - /** * The loaded buffer (reused each time load() is called) */ @@ -3500,7 +3599,7 @@ class document_stream { really_inline bool operator!=(const iterator &other) const noexcept; private: - iterator(document_stream& stream, bool finished) noexcept; + really_inline iterator(document_stream &s, bool finished) noexcept; /** The document_stream we're iterating through. */ document_stream& stream; /** Whether we're finished or not. */ @@ -3523,7 +3622,23 @@ class document_stream { document_stream(document_stream &other) = delete; // Disallow copying - really_inline document_stream(dom::parser &parser, const uint8_t *buf, size_t len, size_t batch_size, error_code error = SUCCESS) noexcept; + /** + * Construct a document_stream. Does not allocate or parse anything until the iterator is + * used. + */ + really_inline document_stream( + dom::parser &parser, + const uint8_t *buf, + size_t len, + size_t batch_size, + error_code error = SUCCESS + ) noexcept; + + /** + * Parse the first document in the buffer. Used by begin(), to handle allocation and + * initialization. + */ + inline void start() noexcept; /** * Parse the next document found in the buffer previously given to document_stream. @@ -3536,10 +3651,7 @@ class document_stream { * pre-allocating a capacity defined by the batch_size defined when creating the * document_stream object. * - * The function returns simdjson::SUCCESS_AND_HAS_MORE (an integer = 1) in case - * of success and indicates that the buffer still contains more data to be parsed, - * meaning this function can be called again to return the next JSON document - * after this one. + * The function returns simdjson::EMPTY if there is no more data to be parsed. * * The function returns simdjson::SUCCESS (as integer = 0) in case of success * and indicates that the buffer has successfully been parsed to the end. @@ -3550,55 +3662,51 @@ class document_stream { * the simdjson::error_message function converts these error codes into a string). * * You can also check validity by calling parser.is_valid(). The same parser can - * and should be reused for the other documents in the buffer. */ - inline error_code json_parse() noexcept; - - /** - * Returns the location (index) of where the next document should be in the - * buffer. - * Can be used for debugging, it tells the user the position of the end of the - * last - * valid JSON document parsed + * and should be reused for the other documents in the buffer. */ - inline size_t get_current_buffer_loc() const { return current_buffer_loc; } + inline void next() noexcept; /** - * Returns the total amount of complete documents parsed by the document_stream, - * in the current buffer, at the given time. + * Pass the next batch through stage 1 and return when finished. + * When threads are enabled, this may wait for the stage 1 thread to finish. */ - inline size_t get_n_parsed_docs() const { return n_parsed_docs; } - - /** - * Returns the total amount of data (in bytes) parsed by the document_stream, - * in the current buffer, at the given time. - */ - inline size_t get_n_bytes_parsed() const { return n_bytes_parsed; } - - inline const uint8_t *buf() const { return _buf + buf_start; } + inline void load_batch() noexcept; - inline void advance(size_t offset) { buf_start += offset; } + /** Get the next document index. */ + inline size_t next_batch_start() const noexcept; - inline size_t remaining() const { return _len - buf_start; } + /** Pass the next batch through stage 1 with the given parser. */ + inline error_code run_stage1(dom::parser &p, size_t batch_start) noexcept; dom::parser &parser; - const uint8_t *_buf; - const size_t _len; - size_t _batch_size; // this is actually variable! - size_t buf_start{0}; - size_t next_json{0}; - bool load_next_batch{true}; - size_t current_buffer_loc{0}; -#ifdef SIMDJSON_THREADS_ENABLED - size_t last_json_buffer_loc{0}; -#endif - size_t n_parsed_docs{0}; - size_t n_bytes_parsed{0}; - error_code error{SUCCESS_AND_HAS_MORE}; + const uint8_t *buf; + const size_t len; + const size_t batch_size; + size_t batch_start{0}; + /** The error (or lack thereof) from the current document. */ + error_code error; + #ifdef SIMDJSON_THREADS_ENABLED - error_code stage1_is_ok_thread{SUCCESS}; - std::thread stage_1_thread{}; - dom::parser parser_thread{}; -#endif + inline void load_from_stage1_thread() noexcept; + + /** Start a thread to run stage 1 on the next batch. */ + inline void start_stage1_thread() noexcept; + + /** Wait for the stage 1 thread to finish and capture the results. */ + inline void finish_stage1_thread() noexcept; + + /** The error returned from the stage 1 thread. */ + error_code stage1_thread_error{UNINITIALIZED}; + /** The thread used to run stage 1 against the next batch in the background. */ + std::thread stage1_thread{}; + + /** + * The parser used to run stage 1 in the background. Will be swapped + * with the regular parser when finished. + */ + dom::parser stage1_thread_parser{}; +#endif // SIMDJSON_THREADS_ENABLED + friend class dom::parser; }; // class document_stream @@ -4842,125 +4950,37 @@ inline std::ostream& operator<<(std::ostream& out, const simdjson_result #include -namespace simdjson { -namespace internal { - -/** - * This algorithm is used to quickly identify the buffer position of - * the last JSON document inside the current batch. - * - * It does its work by finding the last pair of structural characters - * that represent the end followed by the start of a document. - * - * Simply put, we iterate over the structural characters, starting from - * the end. We consider that we found the end of a JSON document when the - * first element of the pair is NOT one of these characters: '{' '[' ';' ',' - * and when the second element is NOT one of these characters: '}' '}' ';' ','. - * - * This simple comparison works most of the time, but it does not cover cases - * where the batch's structural indexes contain a perfect amount of documents. - * In such a case, we do not have access to the structural index which follows - * the last document, therefore, we do not have access to the second element in - * the pair, and means that we cannot identify the last document. To fix this - * issue, we keep a count of the open and closed curly/square braces we found - * while searching for the pair. When we find a pair AND the count of open and - * closed curly/square braces is the same, we know that we just passed a - * complete - * document, therefore the last json buffer location is the end of the batch - * */ -inline uint32_t find_last_json_buf_idx(const uint8_t *buf, size_t size, const dom::parser &parser) { - // this function can be generally useful - if (parser.n_structural_indexes == 0) - return 0; - auto last_i = parser.n_structural_indexes - 1; - if (parser.structural_indexes[last_i] == size) { - if (last_i == 0) - return 0; - last_i = parser.n_structural_indexes - 2; - } - auto arr_cnt = 0; - auto obj_cnt = 0; - for (auto i = last_i; i > 0; i--) { - auto idxb = parser.structural_indexes[i]; - switch (buf[idxb]) { - case ':': - case ',': - continue; - case '}': - obj_cnt--; - continue; - case ']': - arr_cnt--; - continue; - case '{': - obj_cnt++; - break; - case '[': - arr_cnt++; - break; - } - auto idxa = parser.structural_indexes[i - 1]; - switch (buf[idxa]) { - case '{': - case '[': - case ':': - case ',': - continue; - } - if (!arr_cnt && !obj_cnt) { - return last_i + 1; - } - return i; - } - return 0; -} - -// returns true if the provided byte value is an ASCII character -static inline bool is_ascii(char c) { - return ((unsigned char)c) <= 127; -} - -// if the string ends with UTF-8 values, backtrack -// up to the first ASCII character. May return 0. -static inline size_t trimmed_length_safe_utf8(const char * c, size_t len) { - while ((len > 0) and (not is_ascii(c[len - 1]))) { - len--; - } - return len; -} - -} // namespace internal - -} // namespace simdjson - namespace simdjson { namespace dom { + really_inline document_stream::document_stream( dom::parser &_parser, - const uint8_t *buf, - size_t len, - size_t batch_size, + const uint8_t *_buf, + size_t _len, + size_t _batch_size, error_code _error ) noexcept : parser{_parser}, - _buf{buf}, - _len{len}, - _batch_size(batch_size), - error(_error) + buf{_buf}, + len{_len}, + batch_size{_batch_size}, + error{_error} { - if (!error) { error = json_parse(); } } inline document_stream::~document_stream() noexcept { #ifdef SIMDJSON_THREADS_ENABLED - if (stage_1_thread.joinable()) { - stage_1_thread.join(); + // TODO kill the thread, why should people have to wait for a non-side-effecting operation to complete + if (stage1_thread.joinable()) { + stage1_thread.join(); } #endif } really_inline document_stream::iterator document_stream::begin() noexcept { - return iterator(*this, false); + start(); + // If there are no documents, we're finished. + return iterator(*this, error == EMPTY); } really_inline document_stream::iterator document_stream::end() noexcept { @@ -4972,17 +4992,15 @@ really_inline document_stream::iterator::iterator(document_stream& _stream, bool } really_inline simdjson_result document_stream::iterator::operator*() noexcept { - error_code err = stream.error == SUCCESS_AND_HAS_MORE ? SUCCESS : stream.error; - if (err) { return err; } + // Once we have yielded any errors, we're finished. + if (stream.error) { finished = true; return stream.error; } return stream.parser.doc.root(); } really_inline document_stream::iterator& document_stream::iterator::operator++() noexcept { - if (stream.error == SUCCESS_AND_HAS_MORE) { - stream.error = stream.json_parse(); - } else { - finished = true; - } + stream.next(); + // If that was the last document, we're finished. + if (stream.error == EMPTY) { finished = true; } return *this; } @@ -4990,130 +5008,96 @@ really_inline bool document_stream::iterator::operator!=(const document_stream:: return finished != other.finished; } +inline void document_stream::start() noexcept { + if (error) { return; } + + error = parser.ensure_capacity(batch_size); + if (error) { return; } + + // Always run the first stage 1 parse immediately + batch_start = 0; + error = run_stage1(parser, batch_start); + if (error) { return; } + #ifdef SIMDJSON_THREADS_ENABLED + if (next_batch_start() < len) { + // Kick off the first thread if needed + error = stage1_thread_parser.ensure_capacity(batch_size); + if (error) { return; } + start_stage1_thread(); + if (error) { return; } + } +#endif // SIMDJSON_THREADS_ENABLED -// threaded version of json_parse -// todo: simplify this code further -inline error_code document_stream::json_parse() noexcept { - error = parser.ensure_capacity(_batch_size); - if (error) { return error; } - error = parser_thread.ensure_capacity(_batch_size); - if (error) { return error; } - - if (unlikely(load_next_batch)) { - // First time loading - if (!stage_1_thread.joinable()) { - _batch_size = (std::min)(_batch_size, remaining()); - _batch_size = internal::trimmed_length_safe_utf8((const char *)buf(), _batch_size); - if (_batch_size == 0) { - return simdjson::UTF8_ERROR; - } - auto stage1_is_ok = error_code(simdjson::active_implementation->stage1(buf(), _batch_size, parser, true)); - if (stage1_is_ok != simdjson::SUCCESS) { - return stage1_is_ok; - } - uint32_t last_index = internal::find_last_json_buf_idx(buf(), _batch_size, parser); - if (last_index == 0) { - if (parser.n_structural_indexes == 0) { - return simdjson::EMPTY; - } - } else { - parser.n_structural_indexes = last_index + 1; - } - } - // the second thread is running or done. - else { - stage_1_thread.join(); - if (stage1_is_ok_thread != simdjson::SUCCESS) { - return stage1_is_ok_thread; - } - std::swap(parser.structural_indexes, parser_thread.structural_indexes); - parser.n_structural_indexes = parser_thread.n_structural_indexes; - advance(last_json_buffer_loc); - n_bytes_parsed += last_json_buffer_loc; - } - // let us decide whether we will start a new thread - if (remaining() - _batch_size > 0) { - last_json_buffer_loc = - parser.structural_indexes[internal::find_last_json_buf_idx(buf(), _batch_size, parser)]; - _batch_size = (std::min)(_batch_size, remaining() - last_json_buffer_loc); - if (_batch_size > 0) { - _batch_size = internal::trimmed_length_safe_utf8( - (const char *)(buf() + last_json_buffer_loc), _batch_size); - if (_batch_size == 0) { - return simdjson::UTF8_ERROR; - } - // let us capture read-only variables - const uint8_t *const b = buf() + last_json_buffer_loc; - const size_t bs = _batch_size; - // we call the thread on a lambda that will update - // this->stage1_is_ok_thread - // there is only one thread that may write to this value - stage_1_thread = std::thread([this, b, bs] { - this->stage1_is_ok_thread = error_code(simdjson::active_implementation->stage1(b, bs, this->parser_thread, true)); - }); - } - } - next_json = 0; - load_next_batch = false; - } // load_next_batch - error_code res = simdjson::active_implementation->stage2(buf(), remaining(), parser, next_json); - if (res == simdjson::SUCCESS_AND_HAS_MORE) { - n_parsed_docs++; - current_buffer_loc = parser.structural_indexes[next_json]; - load_next_batch = (current_buffer_loc == last_json_buffer_loc); - } else if (res == simdjson::SUCCESS) { - n_parsed_docs++; - if (remaining() > _batch_size) { - current_buffer_loc = parser.structural_indexes[next_json - 1]; - load_next_batch = true; - res = simdjson::SUCCESS_AND_HAS_MORE; - } + next(); +} + +inline void document_stream::next() noexcept { + if (error) { return; } + + // Load the next document from the batch + error = parser.implementation->stage2_next(parser.doc); + + // If that was the last document in the batch, load another batch (if available) + while (error == EMPTY) { + batch_start = next_batch_start(); + if (batch_start >= len) { break; } + +#ifdef SIMDJSON_THREADS_ENABLED + load_from_stage1_thread(); +#else + error = run_stage1(parser, batch_start); +#endif + if (error) { continue; } // If the error was EMPTY, we may want to load another batch. + + // Run stage 2 on the first document in the batch + error = parser.implementation->stage2_next(parser.doc); } - return res; } -#else // SIMDJSON_THREADS_ENABLED +inline size_t document_stream::next_batch_start() const noexcept { + return batch_start + parser.implementation->structural_indexes[parser.implementation->n_structural_indexes]; +} -// single-threaded version of json_parse -inline error_code document_stream::json_parse() noexcept { - error = parser.ensure_capacity(_batch_size); - if (error) { return error; } +inline error_code document_stream::run_stage1(dom::parser &p, size_t _batch_start) noexcept { + // If this is the final batch, pass partial = false + size_t remaining = len - _batch_start; + if (remaining <= batch_size) { + return p.implementation->stage1(&buf[_batch_start], remaining, false); + } else { + return p.implementation->stage1(&buf[_batch_start], batch_size, true); + } +} - if (unlikely(load_next_batch)) { - advance(current_buffer_loc); - n_bytes_parsed += current_buffer_loc; - _batch_size = (std::min)(_batch_size, remaining()); - _batch_size = internal::trimmed_length_safe_utf8((const char *)buf(), _batch_size); - auto stage1_is_ok = (error_code)simdjson::active_implementation->stage1(buf(), _batch_size, parser, true); - if (stage1_is_ok != simdjson::SUCCESS) { - return stage1_is_ok; - } - uint32_t last_index = internal::find_last_json_buf_idx(buf(), _batch_size, parser); - if (last_index == 0) { - if (parser.n_structural_indexes == 0) { - return EMPTY; - } - } else { - parser.n_structural_indexes = last_index + 1; - } - load_next_batch = false; - } // load_next_batch - error_code res = simdjson::active_implementation->stage2(buf(), remaining(), parser, next_json); - if (likely(res == simdjson::SUCCESS_AND_HAS_MORE)) { - n_parsed_docs++; - current_buffer_loc = parser.structural_indexes[next_json]; - } else if (res == simdjson::SUCCESS) { - n_parsed_docs++; - if (remaining() > _batch_size) { - current_buffer_loc = parser.structural_indexes[next_json - 1]; - next_json = 1; - load_next_batch = true; - res = simdjson::SUCCESS_AND_HAS_MORE; - } +#ifdef SIMDJSON_THREADS_ENABLED + +inline void document_stream::load_from_stage1_thread() noexcept { + stage1_thread.join(); + + // Swap to the parser that was loaded up in the thread. Make sure the parser has + // enough memory to swap to, as well. + std::swap(parser, stage1_thread_parser); + error = stage1_thread_error; + if (error) { return; } + + // If there's anything left, start the stage 1 thread! + if (next_batch_start() < len) { + start_stage1_thread(); } - return res; } + +inline void document_stream::start_stage1_thread() noexcept { + // we call the thread on a lambda that will update + // this->stage1_thread_error + // there is only one thread that may write to this value + // TODO this is NOT exception-safe. + this->stage1_thread_error = UNINITIALIZED; // In case something goes wrong, make sure it's an error + size_t _next_batch_start = this->next_batch_start(); + stage1_thread = std::thread([this, _next_batch_start] { + this->stage1_thread_error = run_stage1(this->stage1_thread_parser, _next_batch_start); + }); +} + #endif // SIMDJSON_THREADS_ENABLED } // namespace dom @@ -5152,7 +5136,7 @@ inline error_code document::allocate(size_t capacity) noexcept { // worse with "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6" //where len + 1 tape elements are // generated, see issue https://github.com/lemire/simdjson/issues/345 - size_t tape_capacity = ROUNDUP_N(capacity + 2, 64); + size_t tape_capacity = ROUNDUP_N(capacity + 3, 64); // a document with only zero-length strings... could have len/3 string // and we would need len/3 * 5 bytes on the string buffer size_t string_capacity = ROUNDUP_N(5 * capacity / 3 + 32, 64); @@ -6741,8 +6725,11 @@ namespace dom { // really_inline parser::parser(size_t max_capacity) noexcept : _max_capacity{max_capacity}, - loaded_bytes(nullptr, &aligned_free_char) - {} + loaded_bytes(nullptr, &aligned_free_char) { +} +really_inline parser::parser(parser &&other) noexcept = default; +really_inline parser &parser::operator=(parser &&other) noexcept = default; + inline bool parser::is_valid() const noexcept { return valid; } inline int parser::get_error_code() const noexcept { return error; } inline std::string parser::get_error_message() const noexcept { return error_message(error); } @@ -6825,15 +6812,12 @@ inline simdjson_result parser::parse(const uint8_t *buf, size_t len, bo memcpy((void *)buf, tmp_buf, len); } - code = simdjson::active_implementation->parse(buf, len, *this); + code = implementation->parse(buf, len, doc); if (realloc_if_needed) { aligned_free((void *)buf); // must free before we exit } if (code) { return code; } - // We're indicating validity via the simdjson_result, so set the parse state back to invalid - valid = false; - error = UNINITIALIZED; return doc.root(); } really_inline simdjson_result parser::parse(const char *buf, size_t len, bool realloc_if_needed) & noexcept { @@ -6860,81 +6844,30 @@ inline document_stream parser::parse_many(const padded_string &s, size_t batch_s } really_inline size_t parser::capacity() const noexcept { - return _capacity; + return implementation ? implementation->capacity() : 0; } really_inline size_t parser::max_capacity() const noexcept { return _max_capacity; } really_inline size_t parser::max_depth() const noexcept { - return _max_depth; + return implementation ? implementation->max_depth() : DEFAULT_MAX_DEPTH; } WARN_UNUSED inline error_code parser::allocate(size_t capacity, size_t max_depth) noexcept { // - // If capacity has changed, reallocate capacity-based buffers - // - if (_capacity != capacity) { - // Set capacity to 0 until we finish, in case there's an error - _capacity = 0; - - // - // Reallocate the document - // - error_code err = doc.allocate(capacity); - if (err) { return err; } - - // - // Don't allocate 0 bytes, just return. - // - if (capacity == 0) { - structural_indexes.reset(); - return SUCCESS; - } - - // - // Initialize stage 1 output - // - size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; - structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); // TODO realloc - if (!structural_indexes) { - return MEMALLOC; - } - - _capacity = capacity; - - // - // If capacity hasn't changed, but the document was taken, allocate a new document. + // Reallocate implementation and document if needed // - } else if (!doc.tape) { - error_code err = doc.allocate(capacity); - if (err) { return err; } + error_code err; + if (implementation) { + err = implementation->allocate(capacity, max_depth); + } else { + err = simdjson::active_implementation->create_dom_parser_implementation(capacity, max_depth, implementation); } + if (err) { return err; } - // - // If max_depth has changed, reallocate those buffers - // - if (max_depth != _max_depth) { - _max_depth = 0; - - if (max_depth == 0) { - ret_address.reset(); - containing_scope.reset(); - return SUCCESS; - } - - // - // Initialize stage 2 state - // - containing_scope.reset(new (std::nothrow) internal::scope_descriptor[max_depth]); // TODO realloc - ret_address.reset(new (std::nothrow) internal::ret_address[max_depth]); - - if (!ret_address || !containing_scope) { - // Could not allocate memory - return MEMALLOC; - } - - _max_depth = max_depth; + if (implementation->capacity() != capacity || !doc.tape) { + return doc.allocate(capacity); } return SUCCESS; } @@ -6944,24 +6877,24 @@ inline bool parser::allocate_capacity(size_t capacity, size_t max_depth) noexcep return !allocate(capacity, max_depth); } -really_inline void parser::set_max_capacity(size_t max_capacity) noexcept { - _max_capacity = max_capacity; -} - inline error_code parser::ensure_capacity(size_t desired_capacity) noexcept { // If we don't have enough capacity, (try to) automatically bump it. // If the document was taken, reallocate that too. // Both in one if statement to minimize unlikely branching. - if (unlikely(desired_capacity > capacity() || !doc.tape)) { + if (unlikely(capacity() < desired_capacity || !doc.tape)) { if (desired_capacity > max_capacity()) { return error = CAPACITY; } - return allocate(desired_capacity, _max_depth > 0 ? _max_depth : DEFAULT_MAX_DEPTH); + return allocate(desired_capacity, max_depth()); } return SUCCESS; } +really_inline void parser::set_max_capacity(size_t max_capacity) noexcept { + _max_capacity = max_capacity; +} + } // namespace dom } // namespace simdjson From 5f66c8a764b311c80e374170629655401e0fe5a7 Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Mon, 15 Jun 2020 18:49:46 -0700 Subject: [PATCH 12/16] re-roxygenize(), re-build, ship --- R/RcppExports.R | 4 ++++ src/RcppExports.cpp | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/R/RcppExports.R b/R/RcppExports.R index 4664673..75710c2 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -26,6 +26,10 @@ .Call(`_RcppSimdJson_deserialize_json`, json, json_pointer, empty_array, empty_object, simplify_to, type_policy, int64_r_type) } +.load_json <- function(file_path, json_pointer = "", empty_array = NULL, empty_object = NULL, simplify_to = 0L, type_policy = 0L, int64_r_type = 0L) { + .Call(`_RcppSimdJson_load_json`, file_path, json_pointer, empty_array, empty_object, simplify_to, type_policy, int64_r_type) +} + .exceptions_enabled <- function() { .Call(`_RcppSimdJson_exceptions_enabled`) } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index df640d0..a149941 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -22,6 +22,23 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// load_json +SEXP load_json(const std::string& file_path, const std::string& json_pointer, SEXP empty_array, SEXP empty_object, const int simplify_to, const int type_policy, const int int64_r_type); +RcppExport SEXP _RcppSimdJson_load_json(SEXP file_pathSEXP, SEXP json_pointerSEXP, SEXP empty_arraySEXP, SEXP empty_objectSEXP, SEXP simplify_toSEXP, SEXP type_policySEXP, SEXP int64_r_typeSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::string& >::type file_path(file_pathSEXP); + Rcpp::traits::input_parameter< const std::string& >::type json_pointer(json_pointerSEXP); + Rcpp::traits::input_parameter< SEXP >::type empty_array(empty_arraySEXP); + Rcpp::traits::input_parameter< SEXP >::type empty_object(empty_objectSEXP); + Rcpp::traits::input_parameter< const int >::type simplify_to(simplify_toSEXP); + Rcpp::traits::input_parameter< const int >::type type_policy(type_policySEXP); + Rcpp::traits::input_parameter< const int >::type int64_r_type(int64_r_typeSEXP); + rcpp_result_gen = Rcpp::wrap(load_json(file_path, json_pointer, empty_array, empty_object, simplify_to, type_policy, int64_r_type)); + return rcpp_result_gen; +END_RCPP +} // exceptions_enabled bool exceptions_enabled(); RcppExport SEXP _RcppSimdJson_exceptions_enabled() { @@ -85,6 +102,7 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_RcppSimdJson_deserialize_json", (DL_FUNC) &_RcppSimdJson_deserialize_json, 7}, + {"_RcppSimdJson_load_json", (DL_FUNC) &_RcppSimdJson_load_json, 7}, {"_RcppSimdJson_exceptions_enabled", (DL_FUNC) &_RcppSimdJson_exceptions_enabled, 0}, {"_RcppSimdJson_check_int64", (DL_FUNC) &_RcppSimdJson_check_int64, 0}, {"_RcppSimdJson_validateJSON", (DL_FUNC) &_RcppSimdJson_validateJSON, 1}, From c12be19ecd56d1e3660da057ef197890ffad7210 Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Mon, 15 Jun 2020 19:46:51 -0700 Subject: [PATCH 13/16] fix bad includes --- inst/include/RcppSimdJson/common.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/include/RcppSimdJson/common.hpp b/inst/include/RcppSimdJson/common.hpp index 1261d36..15a3a49 100644 --- a/inst/include/RcppSimdJson/common.hpp +++ b/inst/include/RcppSimdJson/common.hpp @@ -126,8 +126,8 @@ enum class Simplify_To : int { } // namespace rcppsimdjson -#include -#include "RcppSimdJson/utils.hpp" +#include "../simdjson.h" +#include "utils.hpp" namespace rcppsimdjson { From c8e5026c03222de5527df0ed5745c070ae2110a0 Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Mon, 15 Jun 2020 20:24:42 -0700 Subject: [PATCH 14/16] fix line deletion missing from commit 70adc7b24faa90036343f77c9adf2568ce682882 --- inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp index f3909d5..70bfacc 100644 --- a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp +++ b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp @@ -222,7 +222,7 @@ inline constexpr auto Type_Doctor::common_R_type() co if (chr_ && !(dbl_ || i64_ || i32_ || lgl_ || u64_)) { return rcpp_T::chr; } - + if (dbl_ && !(lgl_ || u64_)) { // any number will become double return rcpp_T::dbl; } From 4bb2cec129792be1e801fb3b817cefd4e48119b3 Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Tue, 16 Jun 2020 17:31:32 -0700 Subject: [PATCH 15/16] revert to previous simdjson (Wed May 20 10:23:07 EDT 2020) --- inst/include/simdjson.cpp | 5847 +++++++++++++++++-------------------- inst/include/simdjson.h | 829 +++--- 2 files changed, 3099 insertions(+), 3577 deletions(-) diff --git a/inst/include/simdjson.cpp b/inst/include/simdjson.cpp index d99dc8b..a2d815f 100644 --- a/inst/include/simdjson.cpp +++ b/inst/include/simdjson.cpp @@ -1,4 +1,4 @@ -/* auto-generated on Fri 12 Jun 2020 13:09:36 EDT. Do not edit! */ +/* auto-generated on Wed May 20 10:23:07 EDT 2020. Do not edit! */ /* begin file src/simdjson.cpp */ #include "simdjson.h" @@ -12,6 +12,7 @@ namespace internal { SIMDJSON_DLLIMPORTEXPORT const error_code_info error_codes[] { { SUCCESS, "No error" }, + { SUCCESS_AND_HAS_MORE, "No error and buffer still has more data" }, { CAPACITY, "This parser can't support a document that big" }, { MEMALLOC, "Error allocating memory, we're most likely out of memory" }, { TAPE_ERROR, "The JSON document has an improper structure: missing or superfluous commas, braces, missing keys, etc." }, @@ -358,6 +359,8 @@ static const uint64_t thintable_epi8[256] = { namespace simdjson { namespace haswell { +using namespace simdjson::dom; + class implementation final : public simdjson::implementation { public: really_inline implementation() : simdjson::implementation( @@ -365,12 +368,11 @@ class implementation final : public simdjson::implementation { "Intel/AMD AVX2", instruction_set::AVX2 | instruction_set::PCLMULQDQ | instruction_set::BMI1 | instruction_set::BMI2 ) {} - WARN_UNUSED error_code create_dom_parser_implementation( - size_t capacity, - size_t max_length, - std::unique_ptr& dst - ) const noexcept final; + WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final; WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final; + WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final; + WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final; }; } // namespace haswell @@ -396,12 +398,11 @@ using namespace simdjson::dom; class implementation final : public simdjson::implementation { public: really_inline implementation() : simdjson::implementation("westmere", "Intel/AMD SSE4.2", instruction_set::SSE42 | instruction_set::PCLMULQDQ) {} - WARN_UNUSED error_code create_dom_parser_implementation( - size_t capacity, - size_t max_length, - std::unique_ptr& dst - ) const noexcept final; + WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final; WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final; + WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final; + WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final; }; } // namespace westmere @@ -427,12 +428,11 @@ using namespace simdjson::dom; class implementation final : public simdjson::implementation { public: really_inline implementation() : simdjson::implementation("arm64", "ARM NEON", instruction_set::NEON) {} - WARN_UNUSED error_code create_dom_parser_implementation( - size_t capacity, - size_t max_length, - std::unique_ptr& dst - ) const noexcept final; + WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final; WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final; + WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final; + WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final; }; } // namespace arm64 @@ -462,12 +462,11 @@ class implementation final : public simdjson::implementation { "Generic fallback implementation", 0 ) {} - WARN_UNUSED error_code create_dom_parser_implementation( - size_t capacity, - size_t max_length, - std::unique_ptr& dst - ) const noexcept final; + WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final; WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final; + WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final; + WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final; + WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final; }; } // namespace fallback @@ -490,16 +489,21 @@ class detect_best_supported_implementation_on_first_use final : public implement const std::string &name() const noexcept final { return set_best()->name(); } const std::string &description() const noexcept final { return set_best()->description(); } uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); } - WARN_UNUSED error_code create_dom_parser_implementation( - size_t capacity, - size_t max_length, - std::unique_ptr& dst - ) const noexcept final { - return set_best()->create_dom_parser_implementation(capacity, max_length, dst); + WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept final { + return set_best()->parse(buf, len, parser); } WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final { return set_best()->minify(buf, len, dst, dst_len); } + WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, dom::parser &parser, bool streaming) const noexcept final { + return set_best()->stage1(buf, len, parser, streaming); + } + WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept final { + return set_best()->stage2(buf, len, parser); + } + WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser, size_t &next_json) const noexcept final { + return set_best()->stage2(buf, len, parser, next_json); + } really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {} private: @@ -528,16 +532,21 @@ const std::initializer_list available_implementation_poi // So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support class unsupported_implementation final : public implementation { public: - WARN_UNUSED error_code create_dom_parser_implementation( - size_t, - size_t, - std::unique_ptr& - ) const noexcept final { + WARN_UNUSED error_code parse(const uint8_t *, size_t, dom::parser &) const noexcept final { return UNSUPPORTED_ARCHITECTURE; } WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final { return UNSUPPORTED_ARCHITECTURE; } + WARN_UNUSED error_code stage1(const uint8_t *, size_t, dom::parser &, bool) const noexcept final { + return UNSUPPORTED_ARCHITECTURE; + } + WARN_UNUSED error_code stage2(const uint8_t *, size_t, dom::parser &) const noexcept final { + return UNSUPPORTED_ARCHITECTURE; + } + WARN_UNUSED error_code stage2(const uint8_t *, size_t, dom::parser &, size_t &) const noexcept final { + return UNSUPPORTED_ARCHITECTURE; + } unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {} }; @@ -1933,151 +1942,7 @@ const uint64_t mantissa_128[] = { /* simdprune_tables.h already included: #include "simdprune_tables.h" */ #if SIMDJSON_IMPLEMENTATION_ARM64 -/* begin file src/arm64/implementation.cpp */ -/* arm64/implementation.h already included: #include "arm64/implementation.h" */ -/* begin file src/arm64/dom_parser_implementation.h */ -#ifndef SIMDJSON_ARM64_DOM_PARSER_IMPLEMENTATION_H -#define SIMDJSON_ARM64_DOM_PARSER_IMPLEMENTATION_H - -/* isadetection.h already included: #include "isadetection.h" */ - -namespace simdjson { -namespace arm64 { - -/* begin file src/generic/dom_parser_implementation.h */ -// expectation: sizeof(scope_descriptor) = 64/8. -struct scope_descriptor { - uint32_t tape_index; // where, on the tape, does the scope ([,{) begins - uint32_t count; // how many elements in the scope -}; // struct scope_descriptor - -#ifdef SIMDJSON_USE_COMPUTED_GOTO -typedef void* ret_address_t; -#else -typedef char ret_address_t; -#endif - -class dom_parser_implementation final : public internal::dom_parser_implementation { -public: - /** Tape location of each open { or [ */ - std::unique_ptr containing_scope{}; - /** Return address of each open { or [ */ - std::unique_ptr ret_address{}; - /** Buffer passed to stage 1 */ - const uint8_t *buf{}; - /** Length passed to stage 1 */ - size_t len{0}; - /** Document passed to stage 2 */ - dom::document *doc{}; - /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */ - error_code error{UNINITIALIZED}; - - really_inline dom_parser_implementation(); - dom_parser_implementation(const dom_parser_implementation &) = delete; - dom_parser_implementation & operator=(const dom_parser_implementation &) = delete; - - WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; - WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; - WARN_UNUSED error_code check_for_unclosed_array() noexcept; - WARN_UNUSED error_code stage2(dom::document &doc) noexcept final; - WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final; - WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final; - WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final; -}; - -/* begin file src/generic/stage1/allocate.h */ -namespace stage1 { -namespace allocate { - -// -// Allocates stage 1 internal state and outputs in the parser -// -really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) { - size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; - parser.structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); - if (!parser.structural_indexes) { return MEMALLOC; } - parser.structural_indexes[0] = 0; - parser.n_structural_indexes = 0; - return SUCCESS; -} - -} // namespace allocate -} // namespace stage1 -/* end file src/generic/stage1/allocate.h */ -/* begin file src/generic/stage2/allocate.h */ -namespace stage2 { -namespace allocate { - -// -// Allocates stage 2 internal state and outputs in the parser -// -really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) { - parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]); - parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]); - - if (!parser.ret_address || !parser.containing_scope) { - return MEMALLOC; - } - return SUCCESS; -} - -} // namespace allocate -} // namespace stage2 -/* end file src/generic/stage2/allocate.h */ - -really_inline dom_parser_implementation::dom_parser_implementation() {} - -// Leaving these here so they can be inlined if so desired -WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept { - error_code err = stage1::allocate::set_capacity(*this, capacity); - if (err) { _capacity = 0; return err; } - _capacity = capacity; - return SUCCESS; -} - -WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { - error_code err = stage2::allocate::set_max_depth(*this, max_depth); - if (err) { _max_depth = 0; return err; } - _max_depth = max_depth; - return SUCCESS; -} -/* end file src/generic/stage2/allocate.h */ - -} // namespace arm64 -} // namespace simdjson - -#endif // SIMDJSON_ARM64_DOM_PARSER_IMPLEMENTATION_H -/* end file src/generic/stage2/allocate.h */ - -TARGET_HASWELL - -namespace simdjson { -namespace arm64 { - -WARN_UNUSED error_code implementation::create_dom_parser_implementation( - size_t capacity, - size_t max_depth, - std::unique_ptr& dst -) const noexcept { - dst.reset( new (std::nothrow) dom_parser_implementation() ); - if (!dst) { return MEMALLOC; } - dst->set_capacity(capacity); - dst->set_max_depth(max_depth); - return SUCCESS; -} - -} // namespace arm64 -} // namespace simdjson - -UNTARGET_REGION -/* end file src/generic/stage2/allocate.h */ -/* begin file src/arm64/dom_parser_implementation.cpp */ -/* arm64/implementation.h already included: #include "arm64/implementation.h" */ -/* arm64/dom_parser_implementation.h already included: #include "arm64/dom_parser_implementation.h" */ - -// -// Stage 1 -// +/* begin file src/arm64/stage1.cpp */ /* begin file src/arm64/bitmask.h */ #ifndef SIMDJSON_ARM64_BITMASK_H #define SIMDJSON_ARM64_BITMASK_H @@ -2729,6 +2594,7 @@ really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_ #endif // SIMDJSON_ARM64_SIMD_H /* end file src/arm64/bitmanipulation.h */ /* arm64/bitmanipulation.h already included: #include "arm64/bitmanipulation.h" */ +/* arm64/implementation.h already included: #include "arm64/implementation.h" */ namespace simdjson { namespace arm64 { @@ -2799,21 +2665,24 @@ really_inline simd8 must_be_continuation(simd8 prev1, simd8 struct buf_block_reader { public: - really_inline buf_block_reader(const uint8_t *_buf, size_t _len); - really_inline size_t block_index(); - really_inline bool has_full_block() const; - really_inline const uint8_t *full_block() const; - /** - * Get the last block, padded with spaces. - * - * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this - * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there - * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. - * - * @return the number of effective characters in the last block. - */ - really_inline size_t get_remainder(uint8_t *dst) const; - really_inline void advance(); + really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + really_inline size_t block_index() { return idx; } + really_inline bool has_full_block() const { + return idx < lenminusstep; + } + really_inline const uint8_t *full_block() const { + return &buf[idx]; + } + really_inline bool has_remainder() const { + return idx < len; + } + really_inline void get_remainder(uint8_t *tmp_buf) const { + memset(tmp_buf, 0x20, STEP_SIZE); + memcpy(tmp_buf, buf + idx, len - idx); + } + really_inline void advance() { + idx += STEP_SIZE; + } private: const uint8_t *buf; const size_t len; @@ -2821,18 +2690,6 @@ struct buf_block_reader { size_t idx; }; -constexpr const int TITLE_SIZE = 12; - -// Routines to print masks and text for debugging bitmask operations -UNUSED static char * format_input_text_64(const uint8_t *text) { - static char *buf = (char*)malloc(sizeof(simd8x64) + 1); - for (size_t i=0; i); i++) { - buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); - } - buf[sizeof(simd8x64)] = '\0'; - return buf; -} - // Routines to print masks and text for debugging bitmask operations UNUSED static char * format_input_text(const simd8x64 in) { static char *buf = (char*)malloc(sizeof(simd8x64) + 1); @@ -2852,34 +2709,6 @@ UNUSED static char * format_mask(uint64_t mask) { buf[64] = '\0'; return buf; } - -template -really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} - -template -really_inline size_t buf_block_reader::block_index() { return idx; } - -template -really_inline bool buf_block_reader::has_full_block() const { - return idx < lenminusstep; -} - -template -really_inline const uint8_t *buf_block_reader::full_block() const { - return &buf[idx]; -} - -template -really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { - memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. - memcpy(dst, buf + idx, len - idx); - return len - idx; -} - -template -really_inline void buf_block_reader::advance() { - idx += STEP_SIZE; -} /* end file src/generic/stage1/buf_block_reader.h */ /* begin file src/generic/stage1/json_string_scanner.h */ namespace stage1 { @@ -3179,15 +3008,13 @@ template error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept { buf_block_reader reader(buf, len); json_minifier minifier(dst); - - // Index the first n-1 blocks while (reader.has_full_block()) { minifier.step(reader.full_block(), reader); } - // Index the last (remainder) block, padded with spaces - uint8_t block[STEP_SIZE]; - if (likely(reader.get_remainder(block)) > 0) { + if (likely(reader.has_remainder())) { + uint8_t block[STEP_SIZE]; + reader.get_remainder(block); minifier.step(block, reader); } @@ -3200,94 +3027,6 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); } -/* begin file src/generic/stage1/find_next_document_index.h */ -/** - * This algorithm is used to quickly identify the last structural position that - * makes up a complete document. - * - * It does this by going backwards and finding the last *document boundary* (a - * place where one value follows another without a comma between them). If the - * last document (the characters after the boundary) has an equal number of - * start and end brackets, it is considered complete. - * - * Simply put, we iterate over the structural characters, starting from - * the end. We consider that we found the end of a JSON document when the - * first element of the pair is NOT one of these characters: '{' '[' ';' ',' - * and when the second element is NOT one of these characters: '}' '}' ';' ','. - * - * This simple comparison works most of the time, but it does not cover cases - * where the batch's structural indexes contain a perfect amount of documents. - * In such a case, we do not have access to the structural index which follows - * the last document, therefore, we do not have access to the second element in - * the pair, and that means we cannot identify the last document. To fix this - * issue, we keep a count of the open and closed curly/square braces we found - * while searching for the pair. When we find a pair AND the count of open and - * closed curly/square braces is the same, we know that we just passed a - * complete document, therefore the last json buffer location is the end of the - * batch. - */ -really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) { - // TODO don't count separately, just figure out depth - auto arr_cnt = 0; - auto obj_cnt = 0; - for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { - auto idxb = parser.structural_indexes[i]; - switch (parser.buf[idxb]) { - case ':': - case ',': - continue; - case '}': - obj_cnt--; - continue; - case ']': - arr_cnt--; - continue; - case '{': - obj_cnt++; - break; - case '[': - arr_cnt++; - break; - } - auto idxa = parser.structural_indexes[i - 1]; - switch (parser.buf[idxa]) { - case '{': - case '[': - case ':': - case ',': - continue; - } - // Last document is complete, so the next document will appear after! - if (!arr_cnt && !obj_cnt) { - return parser.n_structural_indexes; - } - // Last document is incomplete; mark the document at i + 1 as the next one - return i; - } - return 0; -} - -// Skip the last character if it is partial -really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) { - if (unlikely(len < 3)) { - switch (len) { - case 2: - if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left - if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left - return len; - case 1: - if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left - return len; - case 0: - return len; - } - } - if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left - if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left - if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left - return len; -} -/* end file src/generic/stage1/find_next_document_index.h */ /* begin file src/generic/stage1/utf8_lookup2_algorithm.h */ // // Detect Unicode errors. @@ -3338,9 +3077,9 @@ really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) { // support values with more than 23 bits (which a 4-byte character supports). // // e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000) -// +// // Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: -// +// // Code Points 1st 2s 3s 4s // U+0000..U+007F 00..7F // U+0080..U+07FF C2..DF 80..BF @@ -3355,7 +3094,6 @@ really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) { using namespace simd; namespace utf8_validation { - // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)". // // Find special case UTF-8 errors where the character is technically readable (has the right length) @@ -3400,7 +3138,7 @@ namespace utf8_validation { const simd8 byte_1_high = prev1.shr<4>().lookup_16( // [0___]____ (ASCII) - 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, // [10__]____ (continuation) 0, 0, 0, 0, @@ -3431,6 +3169,214 @@ namespace utf8_validation { return byte_1_high & byte_1_low & byte_2_high; } + // + // Validate the length of multibyte characters (that each multibyte character has the right number + // of continuation characters, and that all continuation characters are part of a multibyte + // character). + // + // Algorithm + // ========= + // + // This algorithm compares *expected* continuation characters with *actual* continuation bytes, + // and emits an error anytime there is a mismatch. + // + // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte + // characters, the file will look like this: + // + // | Character | 𝄞 | | | | ₿ | | | ֏ | | a | b | + // |-----------------------|----|----|----|----|----|----|----|----|----|----|----| + // | Character Length | 4 | | | | 3 | | | 2 | | 1 | 1 | + // | Byte | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 | + // | is_second_byte | | X | | | | X | | | X | | | + // | is_third_byte | | | X | | | | X | | | | | + // | is_fourth_byte | | | | X | | | | | | | | + // | expected_continuation | | X | X | X | | X | X | | X | | | + // | is_continuation | | X | X | X | | X | X | | X | | | + // + // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation): + // + // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not + // part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just + // floating around extra outside of any character, or that there is an illegal 5-byte character, + // or maybe it's at the beginning of the file before any characters have started; but it's an + // error in all these cases. + // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means + // we started a new character before we were finished with the current one. + // + // Getting the Previous Bytes + // -------------------------- + // + // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte + // character, we need to "shift the bytes" to find that out. This is what they mean: + // + // - `is_continuation`: if the current byte is a continuation. + // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character. + // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character. + // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character. + // + // We use shuffles to go n bytes back, selecting part of the current `input` and part of the + // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller + // function, because the 1-byte-back data is used by other checks as well. + // + // Getting the Continuation Mask + // ----------------------------- + // + // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as + // numbers, using signed `<` and `>` operations to check if they are continuations or leads. + // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because + // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones). + // + // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads," + // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them. + // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0. + // + // When treated as signed numbers, they look like this: + // + // | Type | High Bits | Binary Range | Signed | + // |--------------|------------|--------------|--------| + // | ASCII | `0` | `01111111` | 127 | + // | | | `00000000` | 0 | + // | 4+-Byte Lead | `1111` | `11111111` | -1 | + // | | | `11110000 | -16 | + // | 3-Byte Lead | `1110` | `11101111` | -17 | + // | | | `11100000 | -32 | + // | 2-Byte Lead | `110` | `11011111` | -33 | + // | | | `11000000 | -64 | + // | Continuation | `10` | `10111111` | -65 | + // | | | `10000000 | -128 | + // + // This makes it pretty easy to get the continuation mask! It's just a single comparison: + // + // ``` + // is_continuation = input < -64` + // ``` + // + // We can do something similar for the others, but it takes two comparisons instead of one: "is + // the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and + // `> -64`. Surely we can do better, they're right next to each other! + // + // Getting the is_xxx Masks: Shifting the Range + // -------------------------------------------- + // + // Notice *why* continuations were a single comparison. The actual *range* would require two + // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get + // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be + // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`. + // + // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps + // ASCII down into the negative, and puts 4+-Byte Lead at the top: + // + // | Type | High Bits | Binary Range | Signed | + // |----------------------|------------|--------------|-------| + // | 4+-Byte Lead (+ 127) | `0111` | `01111111` | 127 | + // | | | `01110000 | 112 | + // |----------------------|------------|--------------|-------| + // | 3-Byte Lead (+ 127) | `0110` | `01101111` | 111 | + // | | | `01100000 | 96 | + // |----------------------|------------|--------------|-------| + // | 2-Byte Lead (+ 127) | `010` | `01011111` | 95 | + // | | | `01000000 | 64 | + // |----------------------|------------|--------------|-------| + // | Continuation (+ 127) | `00` | `00111111` | 63 | + // | | | `00000000 | 0 | + // |----------------------|------------|--------------|-------| + // | ASCII (+ 127) | `1` | `11111111` | -1 | + // | | | `10000000` | -128 | + // |----------------------|------------|--------------|-------| + // + // *Now* we can use signed `>` on all of them: + // + // ``` + // prev1 = input.prev<1> + // prev2 = input.prev<2> + // prev3 = input.prev<3> + // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128` + // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128` + // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128` + // is_second_byte = prev1_flipped > 63; // 2+-byte lead + // is_third_byte = prev2_flipped > 95; // 3+-byte lead + // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead + // ``` + // + // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number + // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3 + // `^`'s at a time on Haswell, but only 2 `+`'s). + // + // That doesn't look like it saved us any instructions, did it? Well, because we're adding the + // same number to all of them, we can save one of those `+ 128` operations by assembling + // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128 + // to it. One more instruction saved! + // + // ``` + // prev1 = input.prev<1> + // prev3 = input.prev<3> + // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128` + // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128` + // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped): // | C -> ^ D, or + // you do B | C -> | A -> ^ D. But addition and subtraction *are* associative: (A + B + C) - D can + // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and + // then adds the result together. Same number of operations, but if the processor can run + // independent things in parallel (which most can), it runs faster. + // + // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have + // a super nice advantage in that more of them can be run at the same time (they can run on 3 + // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C, + // saving us the cycle we would have earned by using +. Even more, using an instruction with a + // wider array of ports can help *other* code run ahead, too, since these instructions can "get + // out of the way," running on a port other instructions can't. + // + // Epilogue II: One More Trick + // --------------------------- + // + // There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay + // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in + // check_special_cases()--but we'll talk about that there :) + // really_inline simd8 check_multibyte_lengths(simd8 input, simd8 prev_input, simd8 prev1) { simd8 prev2 = input.prev<2>(prev_input); simd8 prev3 = input.prev<3>(prev_input); @@ -3568,22 +3514,16 @@ class bit_indexer { class json_structural_indexer { public: - /** - * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. - * - * @param partial Setting the partial parameter to true allows the find_structural_bits to - * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If - * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. - */ template - static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept; + static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept; private: - really_inline json_structural_indexer(uint32_t *structural_indexes); + really_inline json_structural_indexer(uint32_t *structural_indexes) + : indexer{structural_indexes} {} template really_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; really_inline void next(simd::simd8x64 in, json_block block, size_t idx); - really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial); + really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming); json_scanner scanner{}; utf8_checker checker{}; @@ -3592,44 +3532,42 @@ class json_structural_indexer { uint64_t unescaped_chars_error = 0; }; -really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} +really_inline void json_structural_indexer::next(simd::simd8x64 in, json_block block, size_t idx) { + uint64_t unescaped = in.lteq(0x1F); + checker.check_next_input(in); + indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser + prev_structurals = block.structural_start(); + unescaped_chars_error |= block.non_quote_inside_string(unescaped); +} -// -// PERF NOTES: -// We pipe 2 inputs through these stages: -// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load -// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. -// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. -// The output of step 1 depends entirely on this information. These functions don't quite use -// up enough CPU: the second half of the functions is highly serial, only using 1 execution core -// at a time. The second input's scans has some dependency on the first ones finishing it, but -// they can make a lot of progress before they need that information. -// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that -// to finish: utf-8 checks and generating the output from the last iteration. -// -// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all -// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough -// workout. -// -template -error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept { - if (unlikely(len > parser.capacity())) { return CAPACITY; } - if (partial) { len = trim_partial_utf8(buf, len); } +really_inline error_code json_structural_indexer::finish(parser &parser, size_t idx, size_t len, bool streaming) { + // Write out the final iteration's structurals + indexer.write(uint32_t(idx-64), prev_structurals); - buf_block_reader reader(buf, len); - json_structural_indexer indexer(parser.structural_indexes.get()); + error_code error = scanner.finish(streaming); + if (unlikely(error != SUCCESS)) { return error; } - // Read all but the last block - while (reader.has_full_block()) { - indexer.step(reader.full_block(), reader); + if (unescaped_chars_error) { + return UNESCAPED_CHARS; } - // Take care of the last block (will always be there unless file is empty) - uint8_t block[STEP_SIZE]; - if (unlikely(reader.get_remainder(block) == 0)) { return EMPTY; } - indexer.step(block, reader); - - return indexer.finish(parser, reader.block_index(), len, partial); + parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); + /* a valid JSON file cannot have zero structural indexes - we should have + * found something */ + if (unlikely(parser.n_structural_indexes == 0u)) { + return EMPTY; + } + if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { + return UNEXPECTED_ERROR; + } + if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) { + /* the string might not be NULL terminated, but we add a virtual NULL + * ending character. */ + parser.structural_indexes[parser.n_structural_indexes++] = uint32_t(len); + } + /* make it safe to dereference one beyond this array */ + parser.structural_indexes[parser.n_structural_indexes] = 0; + return checker.errors(); } template<> @@ -3651,76 +3589,61 @@ really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_b reader.advance(); } -really_inline void json_structural_indexer::next(simd::simd8x64 in, json_block block, size_t idx) { - uint64_t unescaped = in.lteq(0x1F); - checker.check_next_input(in); - indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser - prev_structurals = block.structural_start(); - unescaped_chars_error |= block.non_quote_inside_string(unescaped); -} - -really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) { - // Write out the final iteration's structurals - indexer.write(uint32_t(idx-64), prev_structurals); - - error_code error = scanner.finish(partial); - if (unlikely(error != SUCCESS)) { return error; } +// +// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. +// +// PERF NOTES: +// We pipe 2 inputs through these stages: +// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load +// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. +// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. +// The output of step 1 depends entirely on this information. These functions don't quite use +// up enough CPU: the second half of the functions is highly serial, only using 1 execution core +// at a time. The second input's scans has some dependency on the first ones finishing it, but +// they can make a lot of progress before they need that information. +// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that +// to finish: utf-8 checks and generating the output from the last iteration. +// +// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all +// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough +// workout. +// +// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings. +// The caller should still ensure that the input is valid UTF-8. If you are processing substrings, +// you may want to call on a function like trimmed_length_safe_utf8. +template +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept { + if (unlikely(len > parser.capacity())) { return CAPACITY; } - if (unescaped_chars_error) { - return UNESCAPED_CHARS; + buf_block_reader reader(buf, len); + json_structural_indexer indexer(parser.structural_indexes.get()); + while (reader.has_full_block()) { + indexer.step(reader.full_block(), reader); } - parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); - /*** - * This is related to https://github.com/simdjson/simdjson/issues/906 - * Basically, we want to make sure that if the parsing continues beyond the last (valid) - * structural character, it quickly stops. - * Only three structural characters can be repeated without triggering an error in JSON: [,] and }. - * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing - * continues, then it must be [,] or }. - * Suppose it is ] or }. We backtrack to the first character, what could it be that would - * not trigger an error? It could be ] or } but no, because you can't start a document that way. - * It can't be a comma, a colon or any simple value. So the only way we could continue is - * if the repeated character is [. But if so, the document must start with [. But if the document - * starts with [, it should end with ]. If we enforce that rule, then we would get - * ][[ which is invalid. - **/ - parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); - parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); - parser.structural_indexes[parser.n_structural_indexes + 2] = 0; - parser.next_structural_index = 0; - // a valid JSON file cannot have zero structural indexes - we should have found something - if (unlikely(parser.n_structural_indexes == 0u)) { - return EMPTY; + if (likely(reader.has_remainder())) { + uint8_t block[STEP_SIZE]; + reader.get_remainder(block); + indexer.step(block, reader); } - if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { - return UNEXPECTED_ERROR; - } - if (partial) { - auto new_structural_indexes = find_next_document_index(parser); - if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { - return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. - } - parser.n_structural_indexes = new_structural_indexes; - } - return checker.errors(); + + return indexer.finish(parser, reader.block_index(), len, streaming); } } // namespace stage1 /* end file src/generic/stage1/json_structural_indexer.h */ -WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept { - this->buf = _buf; - this->len = _len; - return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming); +WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept { + return arm64::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming); } } // namespace arm64 } // namespace simdjson +/* end file src/generic/stage1/json_structural_indexer.h */ +/* begin file src/arm64/stage2.cpp */ +#ifndef SIMDJSON_ARM64_STAGE2_H +#define SIMDJSON_ARM64_STAGE2_H -// -// Stage 2 -// - +/* arm64/implementation.h already included: #include "arm64/implementation.h" */ /* begin file src/arm64/stringparsing.h */ #ifndef SIMDJSON_ARM64_STRINGPARSING_H #define SIMDJSON_ARM64_STRINGPARSING_H @@ -4126,10 +4049,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) { // If you consume a large value and you map it to "infinity", you will no // longer be able to serialize back a standard-compliant JSON. And there is // no realistic application where you might need values so large than they - // can't fit in binary64. The maximal value is about 1.7976931348623157 x + // can't fit in binary64. The maximal value is about 1.7976931348623157 × // 10^308 It is an unimaginable large number. There will never be any piece of // engineering involving as many as 10^308 parts. It is estimated that there - // are about 10^80 atoms in the universe. The estimate for the total number + // are about 10^80 atoms in the universe.  The estimate for the total number // of electrons is similar. Using a double-precision floating-point value, we // can represent easily the number of atoms in the universe. We could also // represent the number of ways you can pick any three individual atoms at @@ -4149,6 +4072,26 @@ really_inline bool is_integer(char c) { // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers } +// We need to check that the character following a zero is valid. This is +// probably frequent and it is harder than it looks. We are building all of this +// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)... +const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + +really_inline bool +is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) { + return structural_or_whitespace_or_exponent_or_decimal_negated[c]; +} // check quickly whether the next 8 chars are made of digits // at a glance, it looks better than Mula's @@ -4226,14 +4169,14 @@ never_inline bool parse_large_integer(const uint8_t *const src, // as a positive signed integer, but the negative version is // possible. constexpr int64_t signed_answer = INT64_MIN; - writer.append_s64(signed_answer); + writer.write_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif } else { // we can negate safely int64_t signed_answer = -static_cast(i); - writer.append_s64(signed_answer); + writer.write_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif @@ -4246,12 +4189,12 @@ never_inline bool parse_large_integer(const uint8_t *const src, #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif - writer.append_s64(i); + writer.write_s64(i); } else { #ifdef JSON_TEST_NUMBERS // for unit testing found_unsigned_integer(i, src); #endif - writer.append_u64(i); + writer.write_u64(i); } } return is_structural_or_whitespace(*p); @@ -4261,7 +4204,7 @@ template bool slow_float_parsing(UNUSED const char * src, W writer) { double d; if (parse_float_strtod(src, &d)) { - writer.append_double(d); + writer.write_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, (const uint8_t *)src); #endif @@ -4285,10 +4228,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) { template really_inline bool parse_number(UNUSED const uint8_t *const src, UNUSED bool found_minus, - W &writer) { + W writer) { #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes // useful to skip parsing - writer.append_s64(0); // always write zero + writer.write_s64(0); // always write zero return true; // always succeeds #else const char *p = reinterpret_cast(src); @@ -4308,7 +4251,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, uint64_t i; // an unsigned int avoids signed overflows (which are bad) if (*p == '0') { // 0 cannot be followed by an integer ++p; - if (is_integer(*p)) { + if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) { #ifdef JSON_TEST_NUMBERS // for unit testing found_invalid_number(src); #endif @@ -4432,7 +4375,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, } // we over-decrement by one when there is a '.' digit_count -= int(start - start_digits); - if (unlikely(digit_count >= 19)) { + if (digit_count >= 19) { // Ok, chances are good that we had an overflow! // this is almost never going to get called!!! // we start anew, going slowly!!! @@ -4440,22 +4383,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, // 10000000000000000000000000000000000000000000e+308 // 3.1415926535897932384626433832795028841971693993751 // - bool success = slow_float_parsing((const char *) src, writer); - // The number was already written, but we made a copy of the writer - // when we passed it to the parse_large_integer() function, so - writer.skip_double(); - return success; + return slow_float_parsing((const char *) src, writer); } } if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!! // this is almost never going to get called!!! // we start anew, going slowly!!! - bool success = slow_float_parsing((const char *) src, writer); - // The number was already written, but we made a copy of the writer when we passed it to the - // slow_float_parsing() function, so we have to skip those tape spots now that we've returned - writer.skip_double(); - return success; + return slow_float_parsing((const char *) src, writer); } bool success = true; double d = compute_float_64(exponent, i, negative, &success); @@ -4464,7 +4399,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, success = parse_float_strtod((const char *)src, &d); } if (success) { - writer.append_double(d); + writer.write_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, src); #endif @@ -4479,14 +4414,10 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, if (unlikely(digit_count >= 18)) { // this is uncommon!!! // there is a good chance that we had an overflow, so we need // need to recover: we parse the whole thing again. - bool success = parse_large_integer(src, writer, found_minus); - // The number was already written, but we made a copy of the writer - // when we passed it to the parse_large_integer() function, so - writer.skip_large_integer(); - return success; + return parse_large_integer(src, writer, found_minus); } i = negative ? 0 - i : i; - writer.append_s64(i); + writer.write_s64(i); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif @@ -4508,72 +4439,6 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, namespace simdjson { namespace arm64 { -/* begin file src/generic/stage2/logger.h */ -// This is for an internal-only stage 2 specific logger. -// Set LOG_ENABLED = true to log what stage 2 is doing! -namespace logger { - static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; - - static constexpr const bool LOG_ENABLED = false; - static constexpr const int LOG_EVENT_LEN = 30; - static constexpr const int LOG_BUFFER_LEN = 20; - static constexpr const int LOG_DETAIL_LEN = 50; - static constexpr const int LOG_INDEX_LEN = 10; - - static int log_depth; // Not threadsafe. Log only. - - // Helper to turn unprintable or newline characters into spaces - static really_inline char printable_char(char c) { - if (c >= 0x20) { - return c; - } else { - return ' '; - } - } - - // Print the header and set up log_start - static really_inline void log_start() { - if (LOG_ENABLED) { - log_depth = 0; - printf("\n"); - printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index"); - printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES); - } - } - - static really_inline void log_string(const char *message) { - if (LOG_ENABLED) { - printf("%s\n", message); - } - } - - // Logs a single line of - template - static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { - if (LOG_ENABLED) { - printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); - { - // Print the next N characters in the buffer. - printf("| "); - // Otherwise, print the characters starting from the buffer position. - // Print spaces for unprintable or newline characters. - for (int i=0;i really_inline bool with_space_terminated_copy(const F& f) { @@ -4676,25 +4533,32 @@ class structural_iterator { * practice unless you are in the strange scenario where you have many JSON * documents made of single atoms. */ - char *copy = static_cast(malloc(parser.len + SIMDJSON_PADDING)); + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); if (copy == nullptr) { return true; } - memcpy(copy, buf, parser.len); - memset(copy + parser.len, ' ', SIMDJSON_PADDING); - bool result = f(reinterpret_cast(copy), *current_structural); + memcpy(copy, buf, len); + memset(copy + len, ' ', SIMDJSON_PADDING); + bool result = f(reinterpret_cast(copy), idx); free(copy); return result; } really_inline bool past_end(uint32_t n_structural_indexes) { - return current_structural >= &parser.structural_indexes[n_structural_indexes]; + return next_structural+1 > n_structural_indexes; } really_inline bool at_end(uint32_t n_structural_indexes) { - return current_structural == &parser.structural_indexes[n_structural_indexes]; + return next_structural+1 == n_structural_indexes; } - really_inline bool at_beginning() { - return current_structural == parser.structural_indexes.get(); + really_inline size_t next_structural_index() { + return next_structural; } + + const uint8_t* const buf; + const size_t len; + const uint32_t* const structural_indexes; + size_t next_structural; // next structural index + size_t idx{0}; // location of the structural character in the input (buf) + uint8_t c{0}; // used to track the (structural) character we are looking at }; } // namespace stage2 @@ -4706,105 +4570,8 @@ class structural_iterator { // "simdjson/stage2.h" (this simplifies amalgation) namespace stage2 { -namespace { // Make everything here private - -/* begin file src/generic/stage2/tape_writer.h */ -struct tape_writer { - /** The next place to write to tape */ - uint64_t *next_tape_loc; - - /** Write a signed 64-bit value to tape. */ - really_inline void append_s64(int64_t value) noexcept; - - /** Write an unsigned 64-bit value to tape. */ - really_inline void append_u64(uint64_t value) noexcept; - - /** Write a double value to tape. */ - really_inline void append_double(double value) noexcept; - - /** - * Append a tape entry (an 8-bit type,and 56 bits worth of value). - */ - really_inline void append(uint64_t val, internal::tape_type t) noexcept; - - /** - * Skip the current tape entry without writing. - * - * Used to skip the start of the container, since we'll come back later to fill it in when the - * container ends. - */ - really_inline void skip() noexcept; - - /** - * Skip the number of tape entries necessary to write a large u64 or i64. - */ - really_inline void skip_large_integer() noexcept; - - /** - * Skip the number of tape entries necessary to write a double. - */ - really_inline void skip_double() noexcept; - - /** - * Write a value to a known location on tape. - * - * Used to go back and write out the start of a container after the container ends. - */ - really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; - -private: - /** - * Append both the tape entry, and a supplementary value following it. Used for types that need - * all 64 bits, such as double and uint64_t. - */ - template - really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; -}; // struct number_writer - -really_inline void tape_writer::append_s64(int64_t value) noexcept { - append2(0, value, internal::tape_type::INT64); -} - -really_inline void tape_writer::append_u64(uint64_t value) noexcept { - append(0, internal::tape_type::UINT64); - *next_tape_loc = value; - next_tape_loc++; -} - -/** Write a double value to tape. */ -really_inline void tape_writer::append_double(double value) noexcept { - append2(0, value, internal::tape_type::DOUBLE); -} -really_inline void tape_writer::skip() noexcept { - next_tape_loc++; -} - -really_inline void tape_writer::skip_large_integer() noexcept { - next_tape_loc += 2; -} - -really_inline void tape_writer::skip_double() noexcept { - next_tape_loc += 2; -} - -really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept { - *next_tape_loc = val | ((uint64_t(char(t))) << 56); - next_tape_loc++; -} - -template -really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept { - append(val, t); - static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); - memcpy(next_tape_loc, &val2, sizeof(val2)); - next_tape_loc++; -} - -really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept { - tape_loc = val | ((uint64_t(char(t))) << 56); -} -/* end file src/generic/stage2/tape_writer.h */ +using internal::ret_address; #ifdef SIMDJSON_USE_COMPUTED_GOTO #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue } @@ -4835,88 +4602,102 @@ really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal #endif // SIMDJSON_USE_COMPUTED_GOTO struct unified_machine_addresses { - ret_address_t array_begin; - ret_address_t array_continue; - ret_address_t error; - ret_address_t finish; - ret_address_t object_begin; - ret_address_t object_continue; + ret_address array_begin; + ret_address array_continue; + ret_address error; + ret_address finish; + ret_address object_begin; + ret_address object_continue; }; #undef FAIL_IF #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } } -struct structural_parser : structural_iterator { - /** Lets you append to the tape */ - tape_writer tape; +struct number_writer { + parser &doc_parser; + + really_inline void write_s64(int64_t value) noexcept { + write_tape(0, internal::tape_type::INT64); + std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value)); + ++doc_parser.current_loc; + } + really_inline void write_u64(uint64_t value) noexcept { + write_tape(0, internal::tape_type::UINT64); + doc_parser.doc.tape[doc_parser.current_loc++] = value; + } + really_inline void write_double(double value) noexcept { + write_tape(0, internal::tape_type::DOUBLE); + static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size"); + memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double)); + // doc.tape[doc.current_loc++] = *((uint64_t *)&d); + } + really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { + doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); + } +}; // struct number_writer + +struct structural_parser { + structural_iterator structurals; + parser &doc_parser; /** Next write location in the string buf for stage 2 parsing */ - uint8_t *current_string_buf_loc; - /** Current depth (nested objects and arrays) */ - uint32_t depth{0}; - - // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations - really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index) - : structural_iterator(_parser, start_structural_index), - tape{parser.doc->tape.get()}, - current_string_buf_loc{parser.doc->string_buf.get()} { - } - - WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) { - parser.containing_scope[depth].tape_index = next_tape_index(); - parser.containing_scope[depth].count = 0; - tape.skip(); // We don't actually *write* the start element until the end. - parser.ret_address[depth] = continue_state; + uint8_t *current_string_buf_loc{}; + uint32_t depth; + + really_inline structural_parser( + const uint8_t *buf, + size_t len, + parser &_doc_parser, + uint32_t next_structural = 0 + ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {} + + WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) { + doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc; + doc_parser.containing_scope[depth].count = 0; + write_tape(0, type); // if the document is correct, this gets rewritten later + doc_parser.ret_address[depth] = continue_state; depth++; - bool exceeded_max_depth = depth >= parser.max_depth(); - if (exceeded_max_depth) { log_error("Exceeded max depth!"); } - return exceeded_max_depth; + return depth >= doc_parser.max_depth(); } - WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) { - log_start_value("document"); - return start_scope(continue_state); + WARN_UNUSED really_inline bool start_document(ret_address continue_state) { + return start_scope(internal::tape_type::ROOT, continue_state); } - WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) { - log_start_value("object"); - return start_scope(continue_state); + WARN_UNUSED really_inline bool start_object(ret_address continue_state) { + return start_scope(internal::tape_type::START_OBJECT, continue_state); } - WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) { - log_start_value("array"); - return start_scope(continue_state); + WARN_UNUSED really_inline bool start_array(ret_address continue_state) { + return start_scope(internal::tape_type::START_ARRAY, continue_state); } // this function is responsible for annotating the start of the scope - really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept { + really_inline void end_scope(internal::tape_type type) noexcept { depth--; - // write our doc->tape location to the header scope + // write our doc.tape location to the header scope // The root scope gets written *at* the previous location. - tape.append(parser.containing_scope[depth].tape_index, end); + write_tape(doc_parser.containing_scope[depth].tape_index, type); // count can overflow if it exceeds 24 bits... so we saturate // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). - const uint32_t start_tape_index = parser.containing_scope[depth].tape_index; - const uint32_t count = parser.containing_scope[depth].count; + const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index; + const uint32_t count = doc_parser.containing_scope[depth].count; const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; - // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index] - tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start); - } - - really_inline uint32_t next_tape_index() { - return uint32_t(tape.next_tape_loc - parser.doc->tape.get()); + // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index] + doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32); } really_inline void end_object() { - log_end_value("object"); - end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); + end_scope(internal::tape_type::END_OBJECT); } really_inline void end_array() { - log_end_value("array"); - end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); + end_scope(internal::tape_type::END_ARRAY); } really_inline void end_document() { - log_end_value("document"); - end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT); + end_scope(internal::tape_type::ROOT); + } + + really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { + doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); } // increment_count increments the count of keys in an object or values in an array. @@ -4924,16 +4705,17 @@ struct structural_parser : structural_iterator { // must be increment in the preceding depth (depth-1) where the array or // the object resides. really_inline void increment_count() { - parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 + doc_parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 } really_inline uint8_t *on_start_string() noexcept { - // we advance the point, accounting for the fact that we have a NULL termination - tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING); + /* we advance the point, accounting for the fact that we have a NULL + * termination */ + write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING); return current_string_buf_loc + sizeof(uint32_t); } - really_inline void on_end_string(uint8_t *dst) noexcept { + really_inline bool on_end_string(uint8_t *dst) noexcept { uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); // TODO check for overflow in case someone has a crazy string (>=4GB?) // But only add the overflow check when the document itself exceeds 4GB @@ -4943,49 +4725,73 @@ struct structural_parser : structural_iterator { // be NULL terminated? It comes at a small cost *dst = 0; current_string_buf_loc = dst + 1; + return true; } - WARN_UNUSED really_inline bool parse_string(bool key = false) { - log_value(key ? "key" : "string"); + WARN_UNUSED really_inline bool parse_string() { uint8_t *dst = on_start_string(); - dst = stringparsing::parse_string(current(), dst); + dst = stringparsing::parse_string(structurals.current(), dst); if (dst == nullptr) { - log_error("Invalid escape in string"); return true; } - on_end_string(dst); - return false; + return !on_end_string(dst); } WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) { - log_value("number"); - bool succeeded = numberparsing::parse_number(src, found_minus, tape); - if (!succeeded) { log_error("Invalid number"); } - return !succeeded; + number_writer writer{doc_parser}; + return !numberparsing::parse_number(src, found_minus, writer); } WARN_UNUSED really_inline bool parse_number(bool found_minus) { - return parse_number(current(), found_minus); + return parse_number(structurals.current(), found_minus); + } + + WARN_UNUSED really_inline bool parse_atom() { + switch (structurals.current_char()) { + case 't': + if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; } + write_tape(0, internal::tape_type::TRUE_VALUE); + break; + case 'f': + if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; } + write_tape(0, internal::tape_type::FALSE_VALUE); + break; + case 'n': + if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; } + write_tape(0, internal::tape_type::NULL_VALUE); + break; + default: + return true; + } + return false; } - WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) { - switch (advance_char()) { + WARN_UNUSED really_inline bool parse_single_atom() { + switch (structurals.current_char()) { + case 't': + if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; } + write_tape(0, internal::tape_type::TRUE_VALUE); + break; + case 'f': + if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; } + write_tape(0, internal::tape_type::FALSE_VALUE); + break; + case 'n': + if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; } + write_tape(0, internal::tape_type::NULL_VALUE); + break; + default: + return true; + } + return false; + } + + WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) { + switch (structurals.current_char()) { case '"': FAIL_IF( parse_string() ); return continue_state; - case 't': - log_value("true"); - FAIL_IF( !atomparsing::is_valid_true_atom(current()) ); - tape.append(0, internal::tape_type::TRUE_VALUE); - return continue_state; - case 'f': - log_value("false"); - FAIL_IF( !atomparsing::is_valid_false_atom(current()) ); - tape.append(0, internal::tape_type::FALSE_VALUE); - return continue_state; - case 'n': - log_value("null"); - FAIL_IF( !atomparsing::is_valid_null_atom(current()) ); - tape.append(0, internal::tape_type::NULL_VALUE); + case 't': case 'f': case 'n': + FAIL_IF( parse_atom() ); return continue_state; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': @@ -5001,27 +4807,40 @@ struct structural_parser : structural_iterator { FAIL_IF( start_array(continue_state) ); return addresses.array_begin; default: - log_error("Non-value found when value was expected!"); return addresses.error; } } WARN_UNUSED really_inline error_code finish() { + // the string might not be NULL terminated. + if ( !structurals.at_end(doc_parser.n_structural_indexes) ) { + return on_error(TAPE_ERROR); + } end_document(); - parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]); - if (depth != 0) { - log_error("Unclosed objects or arrays!"); - return parser.error = TAPE_ERROR; + return on_error(TAPE_ERROR); + } + if (doc_parser.containing_scope[depth].tape_index != 0) { + return on_error(TAPE_ERROR); } - return SUCCESS; + return on_success(SUCCESS); + } + + really_inline error_code on_error(error_code new_error_code) noexcept { + doc_parser.error = new_error_code; + return new_error_code; + } + really_inline error_code on_success(error_code success_code) noexcept { + doc_parser.error = success_code; + doc_parser.valid = true; + return success_code; } WARN_UNUSED really_inline error_code error() { - /* We do not need the next line because this is done by parser.init_stage2(), + /* We do not need the next line because this is done by doc_parser.init_stage2(), * pessimistically. - * parser.is_valid = false; + * doc_parser.is_valid = false; * At this point in the code, we have all the time in the world. * Note that we know exactly where we are in the document so we could, * without any overhead on the processing code, report a specific @@ -5029,12 +4848,12 @@ struct structural_parser : structural_iterator { * We could even trigger special code paths to assess what happened * carefully, * all without any added cost. */ - if (depth >= parser.max_depth()) { - return parser.error = DEPTH_ERROR; + if (depth >= doc_parser.max_depth()) { + return on_error(DEPTH_ERROR); } - switch (current_char()) { + switch (structurals.current_char()) { case '"': - return parser.error = STRING_ERROR; + return on_error(STRING_ERROR); case '0': case '1': case '2': @@ -5046,124 +4865,92 @@ struct structural_parser : structural_iterator { case '8': case '9': case '-': - return parser.error = NUMBER_ERROR; + return on_error(NUMBER_ERROR); case 't': - return parser.error = T_ATOM_ERROR; + return on_error(T_ATOM_ERROR); case 'n': - return parser.error = N_ATOM_ERROR; + return on_error(N_ATOM_ERROR); case 'f': - return parser.error = F_ATOM_ERROR; + return on_error(F_ATOM_ERROR); default: - return parser.error = TAPE_ERROR; + return on_error(TAPE_ERROR); } } really_inline void init() { - log_start(); - parser.error = UNINITIALIZED; + current_string_buf_loc = doc_parser.doc.string_buf.get(); + doc_parser.current_loc = 0; + doc_parser.valid = false; + doc_parser.error = UNINITIALIZED; } - WARN_UNUSED really_inline error_code start(ret_address_t finish_state) { - // If there are no structurals left, return EMPTY - if (at_end(parser.n_structural_indexes)) { - return parser.error = EMPTY; + WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) { + init(); // sets is_valid to false + if (len > doc_parser.capacity()) { + return CAPACITY; } - - init(); + // Advance to the first character as soon as possible + structurals.advance_char(); // Push the root scope (there is always at least one scope) if (start_document(finish_state)) { - return parser.error = DEPTH_ERROR; + return on_error(DEPTH_ERROR); } return SUCCESS; } - really_inline void log_value(const char *type) { - logger::log_line(*this, "", type, ""); - } - - static really_inline void log_start() { - logger::log_start(); - } - - really_inline void log_start_value(const char *type) { - logger::log_line(*this, "+", type, ""); - if (logger::LOG_ENABLED) { logger::log_depth++; } - } - - really_inline void log_end_value(const char *type) { - if (logger::LOG_ENABLED) { logger::log_depth--; } - logger::log_line(*this, "-", type, ""); - } - - really_inline void log_error(const char *error) { - logger::log_line(*this, "", "ERROR", error); + really_inline char advance_char() { + return structurals.advance_char(); } -}; // struct structural_parser +}; // Redefine FAIL_IF to use goto since it'll be used inside the function now #undef FAIL_IF #define FAIL_IF(EXPR) { if (EXPR) { goto error; } } -template -WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept { - dom_parser.doc = &doc; +} // namespace stage2 + +/************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. + ***********/ +WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0); - error_code result = parser.start(addresses.finish); + stage2::structural_parser parser(buf, len, doc_parser); + error_code result = parser.start(len, addresses.finish); if (result) { return result; } // // Read first value // - switch (parser.current_char()) { + switch (parser.structurals.current_char()) { case '{': FAIL_IF( parser.start_object(addresses.finish) ); goto object_begin; case '[': FAIL_IF( parser.start_array(addresses.finish) ); - // Make sure the outer array is closed before continuing; otherwise, there are ways we could get - // into memory corruption. See https://github.com/simdjson/simdjson/issues/906 - if (!STREAMING) { - if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') { - goto error; - } - } goto array_begin; case '"': FAIL_IF( parser.parse_string() ); goto finish; - case 't': - parser.log_value("true"); - FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) ); - parser.tape.append(0, internal::tape_type::TRUE_VALUE); - goto finish; - case 'f': - parser.log_value("false"); - FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) ); - parser.tape.append(0, internal::tape_type::FALSE_VALUE); - goto finish; - case 'n': - parser.log_value("null"); - FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) ); - parser.tape.append(0, internal::tape_type::NULL_VALUE); + case 't': case 'f': case 'n': + FAIL_IF( parser.parse_single_atom() ); goto finish; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': FAIL_IF( - parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], false); }) ); goto finish; case '-': FAIL_IF( - parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], true); }) ); goto finish; default: - parser.log_error("Document starts with a non-value character"); goto error; } @@ -5174,45 +4961,43 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p switch (parser.advance_char()) { case '"': { parser.increment_count(); - FAIL_IF( parser.parse_string(true) ); + FAIL_IF( parser.parse_string() ); goto object_key_state; } case '}': parser.end_object(); goto scope_end; default: - parser.log_error("Object does not start with a key"); goto error; } object_key_state: - if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; } + FAIL_IF( parser.advance_char() != ':' ); + parser.advance_char(); GOTO( parser.parse_value(addresses, addresses.object_continue) ); object_continue: switch (parser.advance_char()) { case ',': parser.increment_count(); - if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; } - FAIL_IF( parser.parse_string(true) ); + FAIL_IF( parser.advance_char() != '"' ); + FAIL_IF( parser.parse_string() ); goto object_key_state; case '}': parser.end_object(); goto scope_end; default: - parser.log_error("No comma between object fields"); goto error; } scope_end: - CONTINUE( parser.parser.ret_address[parser.depth] ); + CONTINUE( parser.doc_parser.ret_address[parser.depth] ); // // Array parser states // array_begin: - if (parser.peek_next_char() == ']') { - parser.advance_char(); + if (parser.advance_char() == ']') { parser.end_array(); goto scope_end; } @@ -5227,12 +5012,12 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p switch (parser.advance_char()) { case ',': parser.increment_count(); + parser.advance_char(); goto main_array_switch; case ']': parser.end_array(); goto scope_end; default: - parser.log_error("Missing comma between array values"); goto error; } @@ -5243,298 +5028,194 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p return parser.error(); } -} // namespace {} +WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { + error_code code = stage1(buf, len, doc_parser, false); + if (!code) { + code = stage2(buf, len, doc_parser); + } + return code; +} +/* end file src/generic/stage2/structural_parser.h */ +/* begin file src/generic/stage2/streaming_structural_parser.h */ +namespace stage2 { + +struct streaming_structural_parser: structural_parser { + really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {} + + // override to add streaming + WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) { + init(); // sets is_valid to false + // Capacity ain't no thang for streaming, so we don't check it. + // Advance to the first character as soon as possible + advance_char(); + // Push the root scope (there is always at least one scope) + if (start_document(finish_parser)) { + return on_error(DEPTH_ERROR); + } + return SUCCESS; + } + + // override to add streaming + WARN_UNUSED really_inline error_code finish() { + if ( structurals.past_end(doc_parser.n_structural_indexes) ) { + return on_error(TAPE_ERROR); + } + end_document(); + if (depth != 0) { + return on_error(TAPE_ERROR); + } + if (doc_parser.containing_scope[depth].tape_index != 0) { + return on_error(TAPE_ERROR); + } + bool finished = structurals.at_end(doc_parser.n_structural_indexes); + return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE); + } +}; + } // namespace stage2 /************ * The JSON is parsed to a tape, see the accompanying tape.md file * for documentation. ***********/ -WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { - error_code result = stage2::parse_structurals(*this, _doc); +WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept { + static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); + stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json)); + error_code result = parser.start(len, addresses.finish); if (result) { return result; } + // + // Read first value + // + switch (parser.structurals.current_char()) { + case '{': + FAIL_IF( parser.start_object(addresses.finish) ); + goto object_begin; + case '[': + FAIL_IF( parser.start_array(addresses.finish) ); + goto array_begin; + case '"': + FAIL_IF( parser.parse_string() ); + goto finish; + case 't': case 'f': case 'n': + FAIL_IF( parser.parse_single_atom() ); + goto finish; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + FAIL_IF( + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + return parser.parse_number(©[idx], false); + }) + ); + goto finish; + case '-': + FAIL_IF( + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + return parser.parse_number(©[idx], true); + }) + ); + goto finish; + default: + goto error; + } - // If we didn't make it to the end, it's an error - if ( next_structural_index != n_structural_indexes ) { - logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); - return error = TAPE_ERROR; +// +// Object parser parsers +// +object_begin: + switch (parser.advance_char()) { + case '"': { + FAIL_IF( parser.parse_string() ); + goto object_key_parser; + } + case '}': + parser.end_object(); + goto scope_end; + default: + goto error; } - return SUCCESS; -} +object_key_parser: + FAIL_IF( parser.advance_char() != ':' ); + parser.increment_count(); + parser.advance_char(); + GOTO( parser.parse_value(addresses, addresses.object_continue) ); -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. - ***********/ -WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { - return stage2::parse_structurals(*this, _doc); -} -/* end file src/generic/stage2/tape_writer.h */ +object_continue: + switch (parser.advance_char()) { + case ',': + FAIL_IF( parser.advance_char() != '"' ); + FAIL_IF( parser.parse_string() ); + goto object_key_parser; + case '}': + parser.end_object(); + goto scope_end; + default: + goto error; + } + +scope_end: + CONTINUE( parser.doc_parser.ret_address[parser.depth] ); + +// +// Array parser parsers +// +array_begin: + if (parser.advance_char() == ']') { + parser.end_array(); + goto scope_end; + } + parser.increment_count(); + +main_array_switch: + /* we call update char on all paths in, so we can peek at parser.c on the + * on paths that can accept a close square brace (post-, and at start) */ + GOTO( parser.parse_value(addresses, addresses.array_continue) ); + +array_continue: + switch (parser.advance_char()) { + case ',': + parser.increment_count(); + parser.advance_char(); + goto main_array_switch; + case ']': + parser.end_array(); + goto scope_end; + default: + goto error; + } + +finish: + next_json = parser.structurals.next_structural_index(); + return parser.finish(); -WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { - error_code err = stage1(_buf, _len, false); - if (err) { return err; } - return stage2(_doc); +error: + return parser.error(); } +/* end file src/generic/stage2/streaming_structural_parser.h */ } // namespace arm64 } // namespace simdjson -/* end file src/generic/stage2/tape_writer.h */ + +#endif // SIMDJSON_ARM64_STAGE2_H +/* end file src/generic/stage2/streaming_structural_parser.h */ #endif #if SIMDJSON_IMPLEMENTATION_FALLBACK -/* begin file src/fallback/implementation.cpp */ +/* begin file src/fallback/stage1.cpp */ /* fallback/implementation.h already included: #include "fallback/implementation.h" */ -/* begin file src/fallback/dom_parser_implementation.h */ -#ifndef SIMDJSON_FALLBACK_DOM_PARSER_IMPLEMENTATION_H -#define SIMDJSON_FALLBACK_DOM_PARSER_IMPLEMENTATION_H - -/* isadetection.h already included: #include "isadetection.h" */ namespace simdjson { namespace fallback { +namespace stage1 { -/* begin file src/generic/dom_parser_implementation.h */ -// expectation: sizeof(scope_descriptor) = 64/8. -struct scope_descriptor { - uint32_t tape_index; // where, on the tape, does the scope ([,{) begins - uint32_t count; // how many elements in the scope -}; // struct scope_descriptor - -#ifdef SIMDJSON_USE_COMPUTED_GOTO -typedef void* ret_address_t; -#else -typedef char ret_address_t; -#endif - -class dom_parser_implementation final : public internal::dom_parser_implementation { +class structural_scanner { public: - /** Tape location of each open { or [ */ - std::unique_ptr containing_scope{}; - /** Return address of each open { or [ */ - std::unique_ptr ret_address{}; - /** Buffer passed to stage 1 */ - const uint8_t *buf{}; - /** Length passed to stage 1 */ - size_t len{0}; - /** Document passed to stage 2 */ - dom::document *doc{}; - /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */ - error_code error{UNINITIALIZED}; - - really_inline dom_parser_implementation(); - dom_parser_implementation(const dom_parser_implementation &) = delete; - dom_parser_implementation & operator=(const dom_parser_implementation &) = delete; - - WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; - WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; - WARN_UNUSED error_code check_for_unclosed_array() noexcept; - WARN_UNUSED error_code stage2(dom::document &doc) noexcept final; - WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final; - WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final; - WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final; -}; -/* begin file src/generic/stage1/allocate.h */ -namespace stage1 { -namespace allocate { +really_inline structural_scanner(const uint8_t *_buf, uint32_t _len, parser &_doc_parser, bool _streaming) + : buf{_buf}, next_structural_index{_doc_parser.structural_indexes.get()}, doc_parser{_doc_parser}, idx{0}, len{_len}, error{SUCCESS}, streaming{_streaming} {} -// -// Allocates stage 1 internal state and outputs in the parser -// -really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) { - size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; - parser.structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); - if (!parser.structural_indexes) { return MEMALLOC; } - parser.structural_indexes[0] = 0; - parser.n_structural_indexes = 0; - return SUCCESS; -} - -} // namespace allocate -} // namespace stage1 -/* end file src/generic/stage1/allocate.h */ -/* begin file src/generic/stage2/allocate.h */ -namespace stage2 { -namespace allocate { - -// -// Allocates stage 2 internal state and outputs in the parser -// -really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) { - parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]); - parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]); - - if (!parser.ret_address || !parser.containing_scope) { - return MEMALLOC; - } - return SUCCESS; -} - -} // namespace allocate -} // namespace stage2 -/* end file src/generic/stage2/allocate.h */ - -really_inline dom_parser_implementation::dom_parser_implementation() {} - -// Leaving these here so they can be inlined if so desired -WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept { - error_code err = stage1::allocate::set_capacity(*this, capacity); - if (err) { _capacity = 0; return err; } - _capacity = capacity; - return SUCCESS; -} - -WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { - error_code err = stage2::allocate::set_max_depth(*this, max_depth); - if (err) { _max_depth = 0; return err; } - _max_depth = max_depth; - return SUCCESS; -} -/* end file src/generic/stage2/allocate.h */ - -} // namespace fallback -} // namespace simdjson - -#endif // SIMDJSON_FALLBACK_DOM_PARSER_IMPLEMENTATION_H -/* end file src/generic/stage2/allocate.h */ - -TARGET_HASWELL - -namespace simdjson { -namespace fallback { - -WARN_UNUSED error_code implementation::create_dom_parser_implementation( - size_t capacity, - size_t max_depth, - std::unique_ptr& dst -) const noexcept { - dst.reset( new (std::nothrow) dom_parser_implementation() ); - if (!dst) { return MEMALLOC; } - dst->set_capacity(capacity); - dst->set_max_depth(max_depth); - return SUCCESS; -} - -} // namespace fallback -} // namespace simdjson - -UNTARGET_REGION -/* end file src/generic/stage2/allocate.h */ -/* begin file src/fallback/dom_parser_implementation.cpp */ -/* fallback/implementation.h already included: #include "fallback/implementation.h" */ -/* fallback/dom_parser_implementation.h already included: #include "fallback/dom_parser_implementation.h" */ - -// -// Stage 1 -// -namespace simdjson { -namespace fallback { -namespace stage1 { - -/* begin file src/generic/stage1/find_next_document_index.h */ -/** - * This algorithm is used to quickly identify the last structural position that - * makes up a complete document. - * - * It does this by going backwards and finding the last *document boundary* (a - * place where one value follows another without a comma between them). If the - * last document (the characters after the boundary) has an equal number of - * start and end brackets, it is considered complete. - * - * Simply put, we iterate over the structural characters, starting from - * the end. We consider that we found the end of a JSON document when the - * first element of the pair is NOT one of these characters: '{' '[' ';' ',' - * and when the second element is NOT one of these characters: '}' '}' ';' ','. - * - * This simple comparison works most of the time, but it does not cover cases - * where the batch's structural indexes contain a perfect amount of documents. - * In such a case, we do not have access to the structural index which follows - * the last document, therefore, we do not have access to the second element in - * the pair, and that means we cannot identify the last document. To fix this - * issue, we keep a count of the open and closed curly/square braces we found - * while searching for the pair. When we find a pair AND the count of open and - * closed curly/square braces is the same, we know that we just passed a - * complete document, therefore the last json buffer location is the end of the - * batch. - */ -really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) { - // TODO don't count separately, just figure out depth - auto arr_cnt = 0; - auto obj_cnt = 0; - for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { - auto idxb = parser.structural_indexes[i]; - switch (parser.buf[idxb]) { - case ':': - case ',': - continue; - case '}': - obj_cnt--; - continue; - case ']': - arr_cnt--; - continue; - case '{': - obj_cnt++; - break; - case '[': - arr_cnt++; - break; - } - auto idxa = parser.structural_indexes[i - 1]; - switch (parser.buf[idxa]) { - case '{': - case '[': - case ':': - case ',': - continue; - } - // Last document is complete, so the next document will appear after! - if (!arr_cnt && !obj_cnt) { - return parser.n_structural_indexes; - } - // Last document is incomplete; mark the document at i + 1 as the next one - return i; - } - return 0; -} - -// Skip the last character if it is partial -really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) { - if (unlikely(len < 3)) { - switch (len) { - case 2: - if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left - if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left - return len; - case 1: - if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left - return len; - case 0: - return len; - } - } - if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left - if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left - if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left - return len; -} -/* end file src/generic/stage1/find_next_document_index.h */ - -class structural_scanner { -public: - -really_inline structural_scanner(dom_parser_implementation &_parser, bool _partial) - : buf{_parser.buf}, - next_structural_index{_parser.structural_indexes.get()}, - parser{_parser}, - len{static_cast(_parser.len)}, - partial{_partial} { -} - -really_inline void add_structural() { - *next_structural_index = idx; - next_structural_index++; +really_inline void add_structural() { + *next_structural_index = idx; + next_structural_index++; } really_inline bool is_continuation(uint8_t c) { @@ -5553,12 +5234,7 @@ really_inline void validate_utf8_character() { // 2-byte if ((buf[idx] & 0b00100000) == 0) { // missing continuation - if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { - if (idx+1 > len && partial) { idx = len; return; } - error = UTF8_ERROR; - idx++; - return; - } + if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { error = UTF8_ERROR; idx++; return; } // overlong: 1100000_ 10______ if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; } idx += 2; @@ -5568,12 +5244,7 @@ really_inline void validate_utf8_character() { // 3-byte if ((buf[idx] & 0b00010000) == 0) { // missing continuation - if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { - if (idx+2 > len && partial) { idx = len; return; } - error = UTF8_ERROR; - idx++; - return; - } + if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { error = UTF8_ERROR; idx++; return; } // overlong: 11100000 100_____ ________ if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; } // surrogates: U+D800-U+DFFF 11101101 101_____ @@ -5584,12 +5255,7 @@ really_inline void validate_utf8_character() { // 4-byte // missing continuation - if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { - if (idx+2 > len && partial) { idx = len; return; } - error = UTF8_ERROR; - idx++; - return; - } + if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { error = UTF8_ERROR; idx++; return; } // overlong: 11110000 1000____ ________ ________ if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; } // too large: > U+10FFFF: @@ -5614,7 +5280,7 @@ really_inline void validate_string() { idx++; } } - if (idx >= len && !partial) { error = UNCLOSED_STRING; } + if (idx >= len && !streaming) { error = UNCLOSED_STRING; } } really_inline bool is_whitespace_or_operator(uint8_t c) { @@ -5655,46 +5321,33 @@ really_inline error_code scan() { break; } } - *next_structural_index = len; - // We pad beyond. - // https://github.com/simdjson/simdjson/issues/906 - next_structural_index[1] = len; - next_structural_index[2] = 0; - parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get()); - parser.next_structural_index = 0; - - if (unlikely(parser.n_structural_indexes == 0)) { + if (unlikely(next_structural_index == doc_parser.structural_indexes.get())) { return EMPTY; } - - if (partial) { - auto new_structural_indexes = find_next_document_index(parser); - if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { - return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. - } - parser.n_structural_indexes = new_structural_indexes; - } - + *next_structural_index = len; + next_structural_index++; + doc_parser.n_structural_indexes = uint32_t(next_structural_index - doc_parser.structural_indexes.get()); return error; } private: const uint8_t *buf; uint32_t *next_structural_index; - dom_parser_implementation &parser; + parser &doc_parser; + uint32_t idx; uint32_t len; - uint32_t idx{0}; - error_code error{SUCCESS}; - bool partial; + error_code error; + bool streaming; }; // structural_scanner } // namespace stage1 -WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool partial) noexcept { - this->buf = _buf; - this->len = _len; - stage1::structural_scanner scanner(*this, partial); +WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept { + if (unlikely(len > parser.capacity())) { + return CAPACITY; + } + stage1::structural_scanner scanner(buf, uint32_t(len), parser, streaming); return scanner.scan(); } @@ -5756,10 +5409,10 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui } // namespace fallback } // namespace simdjson +/* end file src/fallback/stage1.cpp */ +/* begin file src/fallback/stage2.cpp */ -// -// Stage 2 -// +/* fallback/implementation.h already included: #include "fallback/implementation.h" */ /* begin file src/fallback/stringparsing.h */ #ifndef SIMDJSON_FALLBACK_STRINGPARSING_H #define SIMDJSON_FALLBACK_STRINGPARSING_H @@ -6219,10 +5872,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) { // If you consume a large value and you map it to "infinity", you will no // longer be able to serialize back a standard-compliant JSON. And there is // no realistic application where you might need values so large than they - // can't fit in binary64. The maximal value is about 1.7976931348623157 x + // can't fit in binary64. The maximal value is about 1.7976931348623157 × // 10^308 It is an unimaginable large number. There will never be any piece of // engineering involving as many as 10^308 parts. It is estimated that there - // are about 10^80 atoms in the universe. The estimate for the total number + // are about 10^80 atoms in the universe.  The estimate for the total number // of electrons is similar. Using a double-precision floating-point value, we // can represent easily the number of atoms in the universe. We could also // represent the number of ways you can pick any three individual atoms at @@ -6242,6 +5895,26 @@ really_inline bool is_integer(char c) { // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers } +// We need to check that the character following a zero is valid. This is +// probably frequent and it is harder than it looks. We are building all of this +// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)... +const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + +really_inline bool +is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) { + return structural_or_whitespace_or_exponent_or_decimal_negated[c]; +} // check quickly whether the next 8 chars are made of digits // at a glance, it looks better than Mula's @@ -6319,14 +5992,14 @@ never_inline bool parse_large_integer(const uint8_t *const src, // as a positive signed integer, but the negative version is // possible. constexpr int64_t signed_answer = INT64_MIN; - writer.append_s64(signed_answer); + writer.write_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif } else { // we can negate safely int64_t signed_answer = -static_cast(i); - writer.append_s64(signed_answer); + writer.write_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif @@ -6339,12 +6012,12 @@ never_inline bool parse_large_integer(const uint8_t *const src, #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif - writer.append_s64(i); + writer.write_s64(i); } else { #ifdef JSON_TEST_NUMBERS // for unit testing found_unsigned_integer(i, src); #endif - writer.append_u64(i); + writer.write_u64(i); } } return is_structural_or_whitespace(*p); @@ -6354,7 +6027,7 @@ template bool slow_float_parsing(UNUSED const char * src, W writer) { double d; if (parse_float_strtod(src, &d)) { - writer.append_double(d); + writer.write_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, (const uint8_t *)src); #endif @@ -6378,10 +6051,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) { template really_inline bool parse_number(UNUSED const uint8_t *const src, UNUSED bool found_minus, - W &writer) { + W writer) { #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes // useful to skip parsing - writer.append_s64(0); // always write zero + writer.write_s64(0); // always write zero return true; // always succeeds #else const char *p = reinterpret_cast(src); @@ -6401,7 +6074,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, uint64_t i; // an unsigned int avoids signed overflows (which are bad) if (*p == '0') { // 0 cannot be followed by an integer ++p; - if (is_integer(*p)) { + if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) { #ifdef JSON_TEST_NUMBERS // for unit testing found_invalid_number(src); #endif @@ -6525,7 +6198,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, } // we over-decrement by one when there is a '.' digit_count -= int(start - start_digits); - if (unlikely(digit_count >= 19)) { + if (digit_count >= 19) { // Ok, chances are good that we had an overflow! // this is almost never going to get called!!! // we start anew, going slowly!!! @@ -6533,22 +6206,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, // 10000000000000000000000000000000000000000000e+308 // 3.1415926535897932384626433832795028841971693993751 // - bool success = slow_float_parsing((const char *) src, writer); - // The number was already written, but we made a copy of the writer - // when we passed it to the parse_large_integer() function, so - writer.skip_double(); - return success; + return slow_float_parsing((const char *) src, writer); } } if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!! // this is almost never going to get called!!! // we start anew, going slowly!!! - bool success = slow_float_parsing((const char *) src, writer); - // The number was already written, but we made a copy of the writer when we passed it to the - // slow_float_parsing() function, so we have to skip those tape spots now that we've returned - writer.skip_double(); - return success; + return slow_float_parsing((const char *) src, writer); } bool success = true; double d = compute_float_64(exponent, i, negative, &success); @@ -6557,7 +6222,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, success = parse_float_strtod((const char *)src, &d); } if (success) { - writer.append_double(d); + writer.write_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, src); #endif @@ -6572,14 +6237,10 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, if (unlikely(digit_count >= 18)) { // this is uncommon!!! // there is a good chance that we had an overflow, so we need // need to recover: we parse the whole thing again. - bool success = parse_large_integer(src, writer, found_minus); - // The number was already written, but we made a copy of the writer - // when we passed it to the parse_large_integer() function, so - writer.skip_large_integer(); - return success; + return parse_large_integer(src, writer, found_minus); } i = negative ? 0 - i : i; - writer.append_s64(i); + writer.write_s64(i); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif @@ -6602,72 +6263,6 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, namespace simdjson { namespace fallback { -/* begin file src/generic/stage2/logger.h */ -// This is for an internal-only stage 2 specific logger. -// Set LOG_ENABLED = true to log what stage 2 is doing! -namespace logger { - static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; - - static constexpr const bool LOG_ENABLED = false; - static constexpr const int LOG_EVENT_LEN = 30; - static constexpr const int LOG_BUFFER_LEN = 20; - static constexpr const int LOG_DETAIL_LEN = 50; - static constexpr const int LOG_INDEX_LEN = 10; - - static int log_depth; // Not threadsafe. Log only. - - // Helper to turn unprintable or newline characters into spaces - static really_inline char printable_char(char c) { - if (c >= 0x20) { - return c; - } else { - return ' '; - } - } - - // Print the header and set up log_start - static really_inline void log_start() { - if (LOG_ENABLED) { - log_depth = 0; - printf("\n"); - printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index"); - printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES); - } - } - - static really_inline void log_string(const char *message) { - if (LOG_ENABLED) { - printf("%s\n", message); - } - } - - // Logs a single line of - template - static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { - if (LOG_ENABLED) { - printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); - { - // Print the next N characters in the buffer. - printf("| "); - // Otherwise, print the characters starting from the buffer position. - // Print spaces for unprintable or newline characters. - for (int i=0;i really_inline bool with_space_terminated_copy(const F& f) { @@ -6770,25 +6357,32 @@ class structural_iterator { * practice unless you are in the strange scenario where you have many JSON * documents made of single atoms. */ - char *copy = static_cast(malloc(parser.len + SIMDJSON_PADDING)); + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); if (copy == nullptr) { return true; } - memcpy(copy, buf, parser.len); - memset(copy + parser.len, ' ', SIMDJSON_PADDING); - bool result = f(reinterpret_cast(copy), *current_structural); + memcpy(copy, buf, len); + memset(copy + len, ' ', SIMDJSON_PADDING); + bool result = f(reinterpret_cast(copy), idx); free(copy); return result; } really_inline bool past_end(uint32_t n_structural_indexes) { - return current_structural >= &parser.structural_indexes[n_structural_indexes]; + return next_structural+1 > n_structural_indexes; } really_inline bool at_end(uint32_t n_structural_indexes) { - return current_structural == &parser.structural_indexes[n_structural_indexes]; + return next_structural+1 == n_structural_indexes; } - really_inline bool at_beginning() { - return current_structural == parser.structural_indexes.get(); + really_inline size_t next_structural_index() { + return next_structural; } + + const uint8_t* const buf; + const size_t len; + const uint32_t* const structural_indexes; + size_t next_structural; // next structural index + size_t idx{0}; // location of the structural character in the input (buf) + uint8_t c{0}; // used to track the (structural) character we are looking at }; } // namespace stage2 @@ -6800,105 +6394,8 @@ class structural_iterator { // "simdjson/stage2.h" (this simplifies amalgation) namespace stage2 { -namespace { // Make everything here private - -/* begin file src/generic/stage2/tape_writer.h */ -struct tape_writer { - /** The next place to write to tape */ - uint64_t *next_tape_loc; - - /** Write a signed 64-bit value to tape. */ - really_inline void append_s64(int64_t value) noexcept; - - /** Write an unsigned 64-bit value to tape. */ - really_inline void append_u64(uint64_t value) noexcept; - - /** Write a double value to tape. */ - really_inline void append_double(double value) noexcept; - - /** - * Append a tape entry (an 8-bit type,and 56 bits worth of value). - */ - really_inline void append(uint64_t val, internal::tape_type t) noexcept; - - /** - * Skip the current tape entry without writing. - * - * Used to skip the start of the container, since we'll come back later to fill it in when the - * container ends. - */ - really_inline void skip() noexcept; - - /** - * Skip the number of tape entries necessary to write a large u64 or i64. - */ - really_inline void skip_large_integer() noexcept; - - /** - * Skip the number of tape entries necessary to write a double. - */ - really_inline void skip_double() noexcept; - - /** - * Write a value to a known location on tape. - * - * Used to go back and write out the start of a container after the container ends. - */ - really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; - -private: - /** - * Append both the tape entry, and a supplementary value following it. Used for types that need - * all 64 bits, such as double and uint64_t. - */ - template - really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; -}; // struct number_writer - -really_inline void tape_writer::append_s64(int64_t value) noexcept { - append2(0, value, internal::tape_type::INT64); -} - -really_inline void tape_writer::append_u64(uint64_t value) noexcept { - append(0, internal::tape_type::UINT64); - *next_tape_loc = value; - next_tape_loc++; -} - -/** Write a double value to tape. */ -really_inline void tape_writer::append_double(double value) noexcept { - append2(0, value, internal::tape_type::DOUBLE); -} - -really_inline void tape_writer::skip() noexcept { - next_tape_loc++; -} - -really_inline void tape_writer::skip_large_integer() noexcept { - next_tape_loc += 2; -} - -really_inline void tape_writer::skip_double() noexcept { - next_tape_loc += 2; -} - -really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept { - *next_tape_loc = val | ((uint64_t(char(t))) << 56); - next_tape_loc++; -} - -template -really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept { - append(val, t); - static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); - memcpy(next_tape_loc, &val2, sizeof(val2)); - next_tape_loc++; -} -really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept { - tape_loc = val | ((uint64_t(char(t))) << 56); -} -/* end file src/generic/stage2/tape_writer.h */ +using internal::ret_address; #ifdef SIMDJSON_USE_COMPUTED_GOTO #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue } @@ -6929,88 +6426,102 @@ really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal #endif // SIMDJSON_USE_COMPUTED_GOTO struct unified_machine_addresses { - ret_address_t array_begin; - ret_address_t array_continue; - ret_address_t error; - ret_address_t finish; - ret_address_t object_begin; - ret_address_t object_continue; + ret_address array_begin; + ret_address array_continue; + ret_address error; + ret_address finish; + ret_address object_begin; + ret_address object_continue; }; #undef FAIL_IF #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } } -struct structural_parser : structural_iterator { - /** Lets you append to the tape */ - tape_writer tape; +struct number_writer { + parser &doc_parser; + + really_inline void write_s64(int64_t value) noexcept { + write_tape(0, internal::tape_type::INT64); + std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value)); + ++doc_parser.current_loc; + } + really_inline void write_u64(uint64_t value) noexcept { + write_tape(0, internal::tape_type::UINT64); + doc_parser.doc.tape[doc_parser.current_loc++] = value; + } + really_inline void write_double(double value) noexcept { + write_tape(0, internal::tape_type::DOUBLE); + static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size"); + memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double)); + // doc.tape[doc.current_loc++] = *((uint64_t *)&d); + } + really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { + doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); + } +}; // struct number_writer + +struct structural_parser { + structural_iterator structurals; + parser &doc_parser; /** Next write location in the string buf for stage 2 parsing */ - uint8_t *current_string_buf_loc; - /** Current depth (nested objects and arrays) */ - uint32_t depth{0}; - - // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations - really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index) - : structural_iterator(_parser, start_structural_index), - tape{parser.doc->tape.get()}, - current_string_buf_loc{parser.doc->string_buf.get()} { - } - - WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) { - parser.containing_scope[depth].tape_index = next_tape_index(); - parser.containing_scope[depth].count = 0; - tape.skip(); // We don't actually *write* the start element until the end. - parser.ret_address[depth] = continue_state; + uint8_t *current_string_buf_loc{}; + uint32_t depth; + + really_inline structural_parser( + const uint8_t *buf, + size_t len, + parser &_doc_parser, + uint32_t next_structural = 0 + ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {} + + WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) { + doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc; + doc_parser.containing_scope[depth].count = 0; + write_tape(0, type); // if the document is correct, this gets rewritten later + doc_parser.ret_address[depth] = continue_state; depth++; - bool exceeded_max_depth = depth >= parser.max_depth(); - if (exceeded_max_depth) { log_error("Exceeded max depth!"); } - return exceeded_max_depth; + return depth >= doc_parser.max_depth(); } - WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) { - log_start_value("document"); - return start_scope(continue_state); + WARN_UNUSED really_inline bool start_document(ret_address continue_state) { + return start_scope(internal::tape_type::ROOT, continue_state); } - WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) { - log_start_value("object"); - return start_scope(continue_state); + WARN_UNUSED really_inline bool start_object(ret_address continue_state) { + return start_scope(internal::tape_type::START_OBJECT, continue_state); } - WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) { - log_start_value("array"); - return start_scope(continue_state); + WARN_UNUSED really_inline bool start_array(ret_address continue_state) { + return start_scope(internal::tape_type::START_ARRAY, continue_state); } // this function is responsible for annotating the start of the scope - really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept { + really_inline void end_scope(internal::tape_type type) noexcept { depth--; - // write our doc->tape location to the header scope + // write our doc.tape location to the header scope // The root scope gets written *at* the previous location. - tape.append(parser.containing_scope[depth].tape_index, end); + write_tape(doc_parser.containing_scope[depth].tape_index, type); // count can overflow if it exceeds 24 bits... so we saturate // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). - const uint32_t start_tape_index = parser.containing_scope[depth].tape_index; - const uint32_t count = parser.containing_scope[depth].count; + const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index; + const uint32_t count = doc_parser.containing_scope[depth].count; const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; - // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index] - tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start); - } - - really_inline uint32_t next_tape_index() { - return uint32_t(tape.next_tape_loc - parser.doc->tape.get()); + // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index] + doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32); } really_inline void end_object() { - log_end_value("object"); - end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); + end_scope(internal::tape_type::END_OBJECT); } really_inline void end_array() { - log_end_value("array"); - end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); + end_scope(internal::tape_type::END_ARRAY); } really_inline void end_document() { - log_end_value("document"); - end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT); + end_scope(internal::tape_type::ROOT); + } + + really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { + doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); } // increment_count increments the count of keys in an object or values in an array. @@ -7018,16 +6529,17 @@ struct structural_parser : structural_iterator { // must be increment in the preceding depth (depth-1) where the array or // the object resides. really_inline void increment_count() { - parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 + doc_parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 } really_inline uint8_t *on_start_string() noexcept { - // we advance the point, accounting for the fact that we have a NULL termination - tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING); + /* we advance the point, accounting for the fact that we have a NULL + * termination */ + write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING); return current_string_buf_loc + sizeof(uint32_t); } - really_inline void on_end_string(uint8_t *dst) noexcept { + really_inline bool on_end_string(uint8_t *dst) noexcept { uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); // TODO check for overflow in case someone has a crazy string (>=4GB?) // But only add the overflow check when the document itself exceeds 4GB @@ -7037,49 +6549,73 @@ struct structural_parser : structural_iterator { // be NULL terminated? It comes at a small cost *dst = 0; current_string_buf_loc = dst + 1; + return true; } - WARN_UNUSED really_inline bool parse_string(bool key = false) { - log_value(key ? "key" : "string"); + WARN_UNUSED really_inline bool parse_string() { uint8_t *dst = on_start_string(); - dst = stringparsing::parse_string(current(), dst); + dst = stringparsing::parse_string(structurals.current(), dst); if (dst == nullptr) { - log_error("Invalid escape in string"); return true; } - on_end_string(dst); - return false; + return !on_end_string(dst); } WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) { - log_value("number"); - bool succeeded = numberparsing::parse_number(src, found_minus, tape); - if (!succeeded) { log_error("Invalid number"); } - return !succeeded; + number_writer writer{doc_parser}; + return !numberparsing::parse_number(src, found_minus, writer); } WARN_UNUSED really_inline bool parse_number(bool found_minus) { - return parse_number(current(), found_minus); + return parse_number(structurals.current(), found_minus); + } + + WARN_UNUSED really_inline bool parse_atom() { + switch (structurals.current_char()) { + case 't': + if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; } + write_tape(0, internal::tape_type::TRUE_VALUE); + break; + case 'f': + if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; } + write_tape(0, internal::tape_type::FALSE_VALUE); + break; + case 'n': + if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; } + write_tape(0, internal::tape_type::NULL_VALUE); + break; + default: + return true; + } + return false; + } + + WARN_UNUSED really_inline bool parse_single_atom() { + switch (structurals.current_char()) { + case 't': + if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; } + write_tape(0, internal::tape_type::TRUE_VALUE); + break; + case 'f': + if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; } + write_tape(0, internal::tape_type::FALSE_VALUE); + break; + case 'n': + if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; } + write_tape(0, internal::tape_type::NULL_VALUE); + break; + default: + return true; + } + return false; } - WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) { - switch (advance_char()) { + WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) { + switch (structurals.current_char()) { case '"': FAIL_IF( parse_string() ); return continue_state; - case 't': - log_value("true"); - FAIL_IF( !atomparsing::is_valid_true_atom(current()) ); - tape.append(0, internal::tape_type::TRUE_VALUE); - return continue_state; - case 'f': - log_value("false"); - FAIL_IF( !atomparsing::is_valid_false_atom(current()) ); - tape.append(0, internal::tape_type::FALSE_VALUE); - return continue_state; - case 'n': - log_value("null"); - FAIL_IF( !atomparsing::is_valid_null_atom(current()) ); - tape.append(0, internal::tape_type::NULL_VALUE); + case 't': case 'f': case 'n': + FAIL_IF( parse_atom() ); return continue_state; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': @@ -7095,27 +6631,40 @@ struct structural_parser : structural_iterator { FAIL_IF( start_array(continue_state) ); return addresses.array_begin; default: - log_error("Non-value found when value was expected!"); return addresses.error; } } WARN_UNUSED really_inline error_code finish() { + // the string might not be NULL terminated. + if ( !structurals.at_end(doc_parser.n_structural_indexes) ) { + return on_error(TAPE_ERROR); + } end_document(); - parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]); - if (depth != 0) { - log_error("Unclosed objects or arrays!"); - return parser.error = TAPE_ERROR; + return on_error(TAPE_ERROR); + } + if (doc_parser.containing_scope[depth].tape_index != 0) { + return on_error(TAPE_ERROR); } - return SUCCESS; + return on_success(SUCCESS); + } + + really_inline error_code on_error(error_code new_error_code) noexcept { + doc_parser.error = new_error_code; + return new_error_code; + } + really_inline error_code on_success(error_code success_code) noexcept { + doc_parser.error = success_code; + doc_parser.valid = true; + return success_code; } WARN_UNUSED really_inline error_code error() { - /* We do not need the next line because this is done by parser.init_stage2(), + /* We do not need the next line because this is done by doc_parser.init_stage2(), * pessimistically. - * parser.is_valid = false; + * doc_parser.is_valid = false; * At this point in the code, we have all the time in the world. * Note that we know exactly where we are in the document so we could, * without any overhead on the processing code, report a specific @@ -7123,12 +6672,12 @@ struct structural_parser : structural_iterator { * We could even trigger special code paths to assess what happened * carefully, * all without any added cost. */ - if (depth >= parser.max_depth()) { - return parser.error = DEPTH_ERROR; + if (depth >= doc_parser.max_depth()) { + return on_error(DEPTH_ERROR); } - switch (current_char()) { + switch (structurals.current_char()) { case '"': - return parser.error = STRING_ERROR; + return on_error(STRING_ERROR); case '0': case '1': case '2': @@ -7140,124 +6689,92 @@ struct structural_parser : structural_iterator { case '8': case '9': case '-': - return parser.error = NUMBER_ERROR; + return on_error(NUMBER_ERROR); case 't': - return parser.error = T_ATOM_ERROR; + return on_error(T_ATOM_ERROR); case 'n': - return parser.error = N_ATOM_ERROR; + return on_error(N_ATOM_ERROR); case 'f': - return parser.error = F_ATOM_ERROR; + return on_error(F_ATOM_ERROR); default: - return parser.error = TAPE_ERROR; + return on_error(TAPE_ERROR); } } really_inline void init() { - log_start(); - parser.error = UNINITIALIZED; + current_string_buf_loc = doc_parser.doc.string_buf.get(); + doc_parser.current_loc = 0; + doc_parser.valid = false; + doc_parser.error = UNINITIALIZED; } - WARN_UNUSED really_inline error_code start(ret_address_t finish_state) { - // If there are no structurals left, return EMPTY - if (at_end(parser.n_structural_indexes)) { - return parser.error = EMPTY; + WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) { + init(); // sets is_valid to false + if (len > doc_parser.capacity()) { + return CAPACITY; } - - init(); + // Advance to the first character as soon as possible + structurals.advance_char(); // Push the root scope (there is always at least one scope) if (start_document(finish_state)) { - return parser.error = DEPTH_ERROR; + return on_error(DEPTH_ERROR); } return SUCCESS; } - really_inline void log_value(const char *type) { - logger::log_line(*this, "", type, ""); - } - - static really_inline void log_start() { - logger::log_start(); - } - - really_inline void log_start_value(const char *type) { - logger::log_line(*this, "+", type, ""); - if (logger::LOG_ENABLED) { logger::log_depth++; } - } - - really_inline void log_end_value(const char *type) { - if (logger::LOG_ENABLED) { logger::log_depth--; } - logger::log_line(*this, "-", type, ""); - } - - really_inline void log_error(const char *error) { - logger::log_line(*this, "", "ERROR", error); + really_inline char advance_char() { + return structurals.advance_char(); } -}; // struct structural_parser +}; // Redefine FAIL_IF to use goto since it'll be used inside the function now #undef FAIL_IF #define FAIL_IF(EXPR) { if (EXPR) { goto error; } } -template -WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept { - dom_parser.doc = &doc; +} // namespace stage2 + +/************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. + ***********/ +WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0); - error_code result = parser.start(addresses.finish); + stage2::structural_parser parser(buf, len, doc_parser); + error_code result = parser.start(len, addresses.finish); if (result) { return result; } // // Read first value // - switch (parser.current_char()) { + switch (parser.structurals.current_char()) { case '{': FAIL_IF( parser.start_object(addresses.finish) ); goto object_begin; case '[': FAIL_IF( parser.start_array(addresses.finish) ); - // Make sure the outer array is closed before continuing; otherwise, there are ways we could get - // into memory corruption. See https://github.com/simdjson/simdjson/issues/906 - if (!STREAMING) { - if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') { - goto error; - } - } goto array_begin; case '"': FAIL_IF( parser.parse_string() ); goto finish; - case 't': - parser.log_value("true"); - FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) ); - parser.tape.append(0, internal::tape_type::TRUE_VALUE); - goto finish; - case 'f': - parser.log_value("false"); - FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) ); - parser.tape.append(0, internal::tape_type::FALSE_VALUE); - goto finish; - case 'n': - parser.log_value("null"); - FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) ); - parser.tape.append(0, internal::tape_type::NULL_VALUE); + case 't': case 'f': case 'n': + FAIL_IF( parser.parse_single_atom() ); goto finish; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': FAIL_IF( - parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], false); }) ); goto finish; case '-': FAIL_IF( - parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], true); }) ); goto finish; default: - parser.log_error("Document starts with a non-value character"); goto error; } @@ -7268,45 +6785,43 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p switch (parser.advance_char()) { case '"': { parser.increment_count(); - FAIL_IF( parser.parse_string(true) ); + FAIL_IF( parser.parse_string() ); goto object_key_state; } case '}': parser.end_object(); goto scope_end; default: - parser.log_error("Object does not start with a key"); goto error; } object_key_state: - if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; } + FAIL_IF( parser.advance_char() != ':' ); + parser.advance_char(); GOTO( parser.parse_value(addresses, addresses.object_continue) ); object_continue: switch (parser.advance_char()) { case ',': parser.increment_count(); - if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; } - FAIL_IF( parser.parse_string(true) ); + FAIL_IF( parser.advance_char() != '"' ); + FAIL_IF( parser.parse_string() ); goto object_key_state; case '}': parser.end_object(); goto scope_end; default: - parser.log_error("No comma between object fields"); goto error; } scope_end: - CONTINUE( parser.parser.ret_address[parser.depth] ); + CONTINUE( parser.doc_parser.ret_address[parser.depth] ); // // Array parser states // array_begin: - if (parser.peek_next_char() == ']') { - parser.advance_char(); + if (parser.advance_char() == ']') { parser.end_array(); goto scope_end; } @@ -7321,12 +6836,12 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p switch (parser.advance_char()) { case ',': parser.increment_count(); + parser.advance_char(); goto main_array_switch; case ']': parser.end_array(); goto scope_end; default: - parser.log_error("Missing comma between array values"); goto error; } @@ -7337,191 +6852,178 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p return parser.error(); } -} // namespace {} -} // namespace stage2 +WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { + error_code code = stage1(buf, len, doc_parser, false); + if (!code) { + code = stage2(buf, len, doc_parser); + } + return code; +} +/* end file src/generic/stage2/structural_parser.h */ +/* begin file src/generic/stage2/streaming_structural_parser.h */ +namespace stage2 { -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. - ***********/ -WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { - error_code result = stage2::parse_structurals(*this, _doc); - if (result) { return result; } +struct streaming_structural_parser: structural_parser { + really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {} - // If we didn't make it to the end, it's an error - if ( next_structural_index != n_structural_indexes ) { - logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); - return error = TAPE_ERROR; + // override to add streaming + WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) { + init(); // sets is_valid to false + // Capacity ain't no thang for streaming, so we don't check it. + // Advance to the first character as soon as possible + advance_char(); + // Push the root scope (there is always at least one scope) + if (start_document(finish_parser)) { + return on_error(DEPTH_ERROR); + } + return SUCCESS; } - return SUCCESS; -} + // override to add streaming + WARN_UNUSED really_inline error_code finish() { + if ( structurals.past_end(doc_parser.n_structural_indexes) ) { + return on_error(TAPE_ERROR); + } + end_document(); + if (depth != 0) { + return on_error(TAPE_ERROR); + } + if (doc_parser.containing_scope[depth].tape_index != 0) { + return on_error(TAPE_ERROR); + } + bool finished = structurals.at_end(doc_parser.n_structural_indexes); + return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE); + } +}; + +} // namespace stage2 /************ * The JSON is parsed to a tape, see the accompanying tape.md file * for documentation. ***********/ -WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { - return stage2::parse_structurals(*this, _doc); -} -/* end file src/generic/stage2/tape_writer.h */ - -WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { - error_code err = stage1(_buf, _len, false); - if (err) { return err; } - return stage2(_doc); -} - -} // namespace fallback -} // namespace simdjson -/* end file src/generic/stage2/tape_writer.h */ -#endif -#if SIMDJSON_IMPLEMENTATION_HASWELL -/* begin file src/haswell/implementation.cpp */ -/* haswell/implementation.h already included: #include "haswell/implementation.h" */ -/* begin file src/haswell/dom_parser_implementation.h */ -#ifndef SIMDJSON_HASWELL_DOM_PARSER_IMPLEMENTATION_H -#define SIMDJSON_HASWELL_DOM_PARSER_IMPLEMENTATION_H - -/* isadetection.h already included: #include "isadetection.h" */ - -namespace simdjson { -namespace haswell { - -/* begin file src/generic/dom_parser_implementation.h */ -// expectation: sizeof(scope_descriptor) = 64/8. -struct scope_descriptor { - uint32_t tape_index; // where, on the tape, does the scope ([,{) begins - uint32_t count; // how many elements in the scope -}; // struct scope_descriptor - -#ifdef SIMDJSON_USE_COMPUTED_GOTO -typedef void* ret_address_t; -#else -typedef char ret_address_t; -#endif - -class dom_parser_implementation final : public internal::dom_parser_implementation { -public: - /** Tape location of each open { or [ */ - std::unique_ptr containing_scope{}; - /** Return address of each open { or [ */ - std::unique_ptr ret_address{}; - /** Buffer passed to stage 1 */ - const uint8_t *buf{}; - /** Length passed to stage 1 */ - size_t len{0}; - /** Document passed to stage 2 */ - dom::document *doc{}; - /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */ - error_code error{UNINITIALIZED}; - - really_inline dom_parser_implementation(); - dom_parser_implementation(const dom_parser_implementation &) = delete; - dom_parser_implementation & operator=(const dom_parser_implementation &) = delete; - - WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; - WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; - WARN_UNUSED error_code check_for_unclosed_array() noexcept; - WARN_UNUSED error_code stage2(dom::document &doc) noexcept final; - WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final; - WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final; - WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final; -}; - -/* begin file src/generic/stage1/allocate.h */ -namespace stage1 { -namespace allocate { - -// -// Allocates stage 1 internal state and outputs in the parser -// -really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) { - size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; - parser.structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); - if (!parser.structural_indexes) { return MEMALLOC; } - parser.structural_indexes[0] = 0; - parser.n_structural_indexes = 0; - return SUCCESS; -} - -} // namespace allocate -} // namespace stage1 -/* end file src/generic/stage1/allocate.h */ -/* begin file src/generic/stage2/allocate.h */ -namespace stage2 { -namespace allocate { +WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept { + static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); + stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json)); + error_code result = parser.start(len, addresses.finish); + if (result) { return result; } + // + // Read first value + // + switch (parser.structurals.current_char()) { + case '{': + FAIL_IF( parser.start_object(addresses.finish) ); + goto object_begin; + case '[': + FAIL_IF( parser.start_array(addresses.finish) ); + goto array_begin; + case '"': + FAIL_IF( parser.parse_string() ); + goto finish; + case 't': case 'f': case 'n': + FAIL_IF( parser.parse_single_atom() ); + goto finish; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + FAIL_IF( + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + return parser.parse_number(©[idx], false); + }) + ); + goto finish; + case '-': + FAIL_IF( + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + return parser.parse_number(©[idx], true); + }) + ); + goto finish; + default: + goto error; + } // -// Allocates stage 2 internal state and outputs in the parser +// Object parser parsers // -really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) { - parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]); - parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]); - - if (!parser.ret_address || !parser.containing_scope) { - return MEMALLOC; +object_begin: + switch (parser.advance_char()) { + case '"': { + FAIL_IF( parser.parse_string() ); + goto object_key_parser; + } + case '}': + parser.end_object(); + goto scope_end; + default: + goto error; } - return SUCCESS; -} - -} // namespace allocate -} // namespace stage2 -/* end file src/generic/stage2/allocate.h */ -really_inline dom_parser_implementation::dom_parser_implementation() {} +object_key_parser: + FAIL_IF( parser.advance_char() != ':' ); + parser.increment_count(); + parser.advance_char(); + GOTO( parser.parse_value(addresses, addresses.object_continue) ); -// Leaving these here so they can be inlined if so desired -WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept { - error_code err = stage1::allocate::set_capacity(*this, capacity); - if (err) { _capacity = 0; return err; } - _capacity = capacity; - return SUCCESS; -} +object_continue: + switch (parser.advance_char()) { + case ',': + FAIL_IF( parser.advance_char() != '"' ); + FAIL_IF( parser.parse_string() ); + goto object_key_parser; + case '}': + parser.end_object(); + goto scope_end; + default: + goto error; + } -WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { - error_code err = stage2::allocate::set_max_depth(*this, max_depth); - if (err) { _max_depth = 0; return err; } - _max_depth = max_depth; - return SUCCESS; -} -/* end file src/generic/stage2/allocate.h */ +scope_end: + CONTINUE( parser.doc_parser.ret_address[parser.depth] ); -} // namespace haswell -} // namespace simdjson +// +// Array parser parsers +// +array_begin: + if (parser.advance_char() == ']') { + parser.end_array(); + goto scope_end; + } + parser.increment_count(); -#endif // SIMDJSON_HASWELL_DOM_PARSER_IMPLEMENTATION_H -/* end file src/generic/stage2/allocate.h */ +main_array_switch: + /* we call update char on all paths in, so we can peek at parser.c on the + * on paths that can accept a close square brace (post-, and at start) */ + GOTO( parser.parse_value(addresses, addresses.array_continue) ); -TARGET_HASWELL +array_continue: + switch (parser.advance_char()) { + case ',': + parser.increment_count(); + parser.advance_char(); + goto main_array_switch; + case ']': + parser.end_array(); + goto scope_end; + default: + goto error; + } -namespace simdjson { -namespace haswell { +finish: + next_json = parser.structurals.next_structural_index(); + return parser.finish(); -WARN_UNUSED error_code implementation::create_dom_parser_implementation( - size_t capacity, - size_t max_depth, - std::unique_ptr& dst -) const noexcept { - dst.reset( new (std::nothrow) dom_parser_implementation() ); - if (!dst) { return MEMALLOC; } - dst->set_capacity(capacity); - dst->set_max_depth(max_depth); - return SUCCESS; +error: + return parser.error(); } +/* end file src/generic/stage2/streaming_structural_parser.h */ -} // namespace haswell +} // namespace fallback } // namespace simdjson +/* end file src/generic/stage2/streaming_structural_parser.h */ +#endif +#if SIMDJSON_IMPLEMENTATION_HASWELL +/* begin file src/haswell/stage1.cpp */ -UNTARGET_REGION -/* end file src/generic/stage2/allocate.h */ -/* begin file src/haswell/dom_parser_implementation.cpp */ -/* haswell/implementation.h already included: #include "haswell/implementation.h" */ -/* haswell/dom_parser_implementation.h already included: #include "haswell/dom_parser_implementation.h" */ - -// -// Stage 1 -// /* begin file src/haswell/bitmask.h */ #ifndef SIMDJSON_HASWELL_BITMASK_H #define SIMDJSON_HASWELL_BITMASK_H @@ -8066,6 +7568,7 @@ UNTARGET_REGION #endif // SIMDJSON_HASWELL_SIMD_H /* end file src/haswell/bitmanipulation.h */ /* haswell/bitmanipulation.h already included: #include "haswell/bitmanipulation.h" */ +/* haswell/implementation.h already included: #include "haswell/implementation.h" */ TARGET_HASWELL namespace simdjson { @@ -8124,21 +7627,24 @@ really_inline simd8 must_be_continuation(simd8 prev1, simd8 struct buf_block_reader { public: - really_inline buf_block_reader(const uint8_t *_buf, size_t _len); - really_inline size_t block_index(); - really_inline bool has_full_block() const; - really_inline const uint8_t *full_block() const; - /** - * Get the last block, padded with spaces. - * - * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this - * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there - * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. - * - * @return the number of effective characters in the last block. - */ - really_inline size_t get_remainder(uint8_t *dst) const; - really_inline void advance(); + really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + really_inline size_t block_index() { return idx; } + really_inline bool has_full_block() const { + return idx < lenminusstep; + } + really_inline const uint8_t *full_block() const { + return &buf[idx]; + } + really_inline bool has_remainder() const { + return idx < len; + } + really_inline void get_remainder(uint8_t *tmp_buf) const { + memset(tmp_buf, 0x20, STEP_SIZE); + memcpy(tmp_buf, buf + idx, len - idx); + } + really_inline void advance() { + idx += STEP_SIZE; + } private: const uint8_t *buf; const size_t len; @@ -8146,18 +7652,6 @@ struct buf_block_reader { size_t idx; }; -constexpr const int TITLE_SIZE = 12; - -// Routines to print masks and text for debugging bitmask operations -UNUSED static char * format_input_text_64(const uint8_t *text) { - static char *buf = (char*)malloc(sizeof(simd8x64) + 1); - for (size_t i=0; i); i++) { - buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); - } - buf[sizeof(simd8x64)] = '\0'; - return buf; -} - // Routines to print masks and text for debugging bitmask operations UNUSED static char * format_input_text(const simd8x64 in) { static char *buf = (char*)malloc(sizeof(simd8x64) + 1); @@ -8177,34 +7671,6 @@ UNUSED static char * format_mask(uint64_t mask) { buf[64] = '\0'; return buf; } - -template -really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} - -template -really_inline size_t buf_block_reader::block_index() { return idx; } - -template -really_inline bool buf_block_reader::has_full_block() const { - return idx < lenminusstep; -} - -template -really_inline const uint8_t *buf_block_reader::full_block() const { - return &buf[idx]; -} - -template -really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { - memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. - memcpy(dst, buf + idx, len - idx); - return len - idx; -} - -template -really_inline void buf_block_reader::advance() { - idx += STEP_SIZE; -} /* end file src/generic/stage1/buf_block_reader.h */ /* begin file src/generic/stage1/json_string_scanner.h */ namespace stage1 { @@ -8504,15 +7970,13 @@ template error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept { buf_block_reader reader(buf, len); json_minifier minifier(dst); - - // Index the first n-1 blocks while (reader.has_full_block()) { minifier.step(reader.full_block(), reader); } - // Index the last (remainder) block, padded with spaces - uint8_t block[STEP_SIZE]; - if (likely(reader.get_remainder(block)) > 0) { + if (likely(reader.has_remainder())) { + uint8_t block[STEP_SIZE]; + reader.get_remainder(block); minifier.step(block, reader); } @@ -8525,94 +7989,6 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len); } -/* begin file src/generic/stage1/find_next_document_index.h */ -/** - * This algorithm is used to quickly identify the last structural position that - * makes up a complete document. - * - * It does this by going backwards and finding the last *document boundary* (a - * place where one value follows another without a comma between them). If the - * last document (the characters after the boundary) has an equal number of - * start and end brackets, it is considered complete. - * - * Simply put, we iterate over the structural characters, starting from - * the end. We consider that we found the end of a JSON document when the - * first element of the pair is NOT one of these characters: '{' '[' ';' ',' - * and when the second element is NOT one of these characters: '}' '}' ';' ','. - * - * This simple comparison works most of the time, but it does not cover cases - * where the batch's structural indexes contain a perfect amount of documents. - * In such a case, we do not have access to the structural index which follows - * the last document, therefore, we do not have access to the second element in - * the pair, and that means we cannot identify the last document. To fix this - * issue, we keep a count of the open and closed curly/square braces we found - * while searching for the pair. When we find a pair AND the count of open and - * closed curly/square braces is the same, we know that we just passed a - * complete document, therefore the last json buffer location is the end of the - * batch. - */ -really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) { - // TODO don't count separately, just figure out depth - auto arr_cnt = 0; - auto obj_cnt = 0; - for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { - auto idxb = parser.structural_indexes[i]; - switch (parser.buf[idxb]) { - case ':': - case ',': - continue; - case '}': - obj_cnt--; - continue; - case ']': - arr_cnt--; - continue; - case '{': - obj_cnt++; - break; - case '[': - arr_cnt++; - break; - } - auto idxa = parser.structural_indexes[i - 1]; - switch (parser.buf[idxa]) { - case '{': - case '[': - case ':': - case ',': - continue; - } - // Last document is complete, so the next document will appear after! - if (!arr_cnt && !obj_cnt) { - return parser.n_structural_indexes; - } - // Last document is incomplete; mark the document at i + 1 as the next one - return i; - } - return 0; -} - -// Skip the last character if it is partial -really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) { - if (unlikely(len < 3)) { - switch (len) { - case 2: - if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left - if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left - return len; - case 1: - if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left - return len; - case 0: - return len; - } - } - if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left - if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left - if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left - return len; -} -/* end file src/generic/stage1/find_next_document_index.h */ /* begin file src/generic/stage1/utf8_lookup2_algorithm.h */ // // Detect Unicode errors. @@ -8663,9 +8039,9 @@ really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) { // support values with more than 23 bits (which a 4-byte character supports). // // e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000) -// +// // Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: -// +// // Code Points 1st 2s 3s 4s // U+0000..U+007F 00..7F // U+0080..U+07FF C2..DF 80..BF @@ -8680,7 +8056,6 @@ really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) { using namespace simd; namespace utf8_validation { - // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)". // // Find special case UTF-8 errors where the character is technically readable (has the right length) @@ -8725,7 +8100,7 @@ namespace utf8_validation { const simd8 byte_1_high = prev1.shr<4>().lookup_16( // [0___]____ (ASCII) - 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, // [10__]____ (continuation) 0, 0, 0, 0, @@ -8756,6 +8131,214 @@ namespace utf8_validation { return byte_1_high & byte_1_low & byte_2_high; } + // + // Validate the length of multibyte characters (that each multibyte character has the right number + // of continuation characters, and that all continuation characters are part of a multibyte + // character). + // + // Algorithm + // ========= + // + // This algorithm compares *expected* continuation characters with *actual* continuation bytes, + // and emits an error anytime there is a mismatch. + // + // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte + // characters, the file will look like this: + // + // | Character | 𝄞 | | | | ₿ | | | ֏ | | a | b | + // |-----------------------|----|----|----|----|----|----|----|----|----|----|----| + // | Character Length | 4 | | | | 3 | | | 2 | | 1 | 1 | + // | Byte | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 | + // | is_second_byte | | X | | | | X | | | X | | | + // | is_third_byte | | | X | | | | X | | | | | + // | is_fourth_byte | | | | X | | | | | | | | + // | expected_continuation | | X | X | X | | X | X | | X | | | + // | is_continuation | | X | X | X | | X | X | | X | | | + // + // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation): + // + // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not + // part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just + // floating around extra outside of any character, or that there is an illegal 5-byte character, + // or maybe it's at the beginning of the file before any characters have started; but it's an + // error in all these cases. + // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means + // we started a new character before we were finished with the current one. + // + // Getting the Previous Bytes + // -------------------------- + // + // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte + // character, we need to "shift the bytes" to find that out. This is what they mean: + // + // - `is_continuation`: if the current byte is a continuation. + // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character. + // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character. + // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character. + // + // We use shuffles to go n bytes back, selecting part of the current `input` and part of the + // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller + // function, because the 1-byte-back data is used by other checks as well. + // + // Getting the Continuation Mask + // ----------------------------- + // + // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as + // numbers, using signed `<` and `>` operations to check if they are continuations or leads. + // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because + // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones). + // + // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads," + // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them. + // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0. + // + // When treated as signed numbers, they look like this: + // + // | Type | High Bits | Binary Range | Signed | + // |--------------|------------|--------------|--------| + // | ASCII | `0` | `01111111` | 127 | + // | | | `00000000` | 0 | + // | 4+-Byte Lead | `1111` | `11111111` | -1 | + // | | | `11110000 | -16 | + // | 3-Byte Lead | `1110` | `11101111` | -17 | + // | | | `11100000 | -32 | + // | 2-Byte Lead | `110` | `11011111` | -33 | + // | | | `11000000 | -64 | + // | Continuation | `10` | `10111111` | -65 | + // | | | `10000000 | -128 | + // + // This makes it pretty easy to get the continuation mask! It's just a single comparison: + // + // ``` + // is_continuation = input < -64` + // ``` + // + // We can do something similar for the others, but it takes two comparisons instead of one: "is + // the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and + // `> -64`. Surely we can do better, they're right next to each other! + // + // Getting the is_xxx Masks: Shifting the Range + // -------------------------------------------- + // + // Notice *why* continuations were a single comparison. The actual *range* would require two + // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get + // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be + // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`. + // + // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps + // ASCII down into the negative, and puts 4+-Byte Lead at the top: + // + // | Type | High Bits | Binary Range | Signed | + // |----------------------|------------|--------------|-------| + // | 4+-Byte Lead (+ 127) | `0111` | `01111111` | 127 | + // | | | `01110000 | 112 | + // |----------------------|------------|--------------|-------| + // | 3-Byte Lead (+ 127) | `0110` | `01101111` | 111 | + // | | | `01100000 | 96 | + // |----------------------|------------|--------------|-------| + // | 2-Byte Lead (+ 127) | `010` | `01011111` | 95 | + // | | | `01000000 | 64 | + // |----------------------|------------|--------------|-------| + // | Continuation (+ 127) | `00` | `00111111` | 63 | + // | | | `00000000 | 0 | + // |----------------------|------------|--------------|-------| + // | ASCII (+ 127) | `1` | `11111111` | -1 | + // | | | `10000000` | -128 | + // |----------------------|------------|--------------|-------| + // + // *Now* we can use signed `>` on all of them: + // + // ``` + // prev1 = input.prev<1> + // prev2 = input.prev<2> + // prev3 = input.prev<3> + // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128` + // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128` + // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128` + // is_second_byte = prev1_flipped > 63; // 2+-byte lead + // is_third_byte = prev2_flipped > 95; // 3+-byte lead + // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead + // ``` + // + // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number + // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3 + // `^`'s at a time on Haswell, but only 2 `+`'s). + // + // That doesn't look like it saved us any instructions, did it? Well, because we're adding the + // same number to all of them, we can save one of those `+ 128` operations by assembling + // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128 + // to it. One more instruction saved! + // + // ``` + // prev1 = input.prev<1> + // prev3 = input.prev<3> + // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128` + // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128` + // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped): // | C -> ^ D, or + // you do B | C -> | A -> ^ D. But addition and subtraction *are* associative: (A + B + C) - D can + // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and + // then adds the result together. Same number of operations, but if the processor can run + // independent things in parallel (which most can), it runs faster. + // + // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have + // a super nice advantage in that more of them can be run at the same time (they can run on 3 + // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C, + // saving us the cycle we would have earned by using +. Even more, using an instruction with a + // wider array of ports can help *other* code run ahead, too, since these instructions can "get + // out of the way," running on a port other instructions can't. + // + // Epilogue II: One More Trick + // --------------------------- + // + // There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay + // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in + // check_special_cases()--but we'll talk about that there :) + // really_inline simd8 check_multibyte_lengths(simd8 input, simd8 prev_input, simd8 prev1) { simd8 prev2 = input.prev<2>(prev_input); simd8 prev3 = input.prev<3>(prev_input); @@ -8893,22 +8476,16 @@ class bit_indexer { class json_structural_indexer { public: - /** - * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. - * - * @param partial Setting the partial parameter to true allows the find_structural_bits to - * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If - * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. - */ template - static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept; + static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept; private: - really_inline json_structural_indexer(uint32_t *structural_indexes); + really_inline json_structural_indexer(uint32_t *structural_indexes) + : indexer{structural_indexes} {} template really_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; really_inline void next(simd::simd8x64 in, json_block block, size_t idx); - really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial); + really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming); json_scanner scanner{}; utf8_checker checker{}; @@ -8917,8 +8494,65 @@ class json_structural_indexer { uint64_t unescaped_chars_error = 0; }; -really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} +really_inline void json_structural_indexer::next(simd::simd8x64 in, json_block block, size_t idx) { + uint64_t unescaped = in.lteq(0x1F); + checker.check_next_input(in); + indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser + prev_structurals = block.structural_start(); + unescaped_chars_error |= block.non_quote_inside_string(unescaped); +} + +really_inline error_code json_structural_indexer::finish(parser &parser, size_t idx, size_t len, bool streaming) { + // Write out the final iteration's structurals + indexer.write(uint32_t(idx-64), prev_structurals); + + error_code error = scanner.finish(streaming); + if (unlikely(error != SUCCESS)) { return error; } + + if (unescaped_chars_error) { + return UNESCAPED_CHARS; + } + + parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); + /* a valid JSON file cannot have zero structural indexes - we should have + * found something */ + if (unlikely(parser.n_structural_indexes == 0u)) { + return EMPTY; + } + if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { + return UNEXPECTED_ERROR; + } + if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) { + /* the string might not be NULL terminated, but we add a virtual NULL + * ending character. */ + parser.structural_indexes[parser.n_structural_indexes++] = uint32_t(len); + } + /* make it safe to dereference one beyond this array */ + parser.structural_indexes[parser.n_structural_indexes] = 0; + return checker.errors(); +} + +template<> +really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept { + simd::simd8x64 in_1(block); + simd::simd8x64 in_2(block+64); + json_block block_1 = scanner.next(in_1); + json_block block_2 = scanner.next(in_2); + this->next(in_1, block_1, reader.block_index()); + this->next(in_2, block_2, reader.block_index()+64); + reader.advance(); +} + +template<> +really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept { + simd::simd8x64 in_1(block); + json_block block_1 = scanner.next(in_1); + this->next(in_1, block_1, reader.block_index()); + reader.advance(); +} +// +// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. // // PERF NOTES: // We pipe 2 inputs through these stages: @@ -8936,116 +8570,41 @@ really_inline json_structural_indexer::json_structural_indexer(uint32_t *structu // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough // workout. // +// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings. +// The caller should still ensure that the input is valid UTF-8. If you are processing substrings, +// you may want to call on a function like trimmed_length_safe_utf8. template -error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept { +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept { if (unlikely(len > parser.capacity())) { return CAPACITY; } - if (partial) { len = trim_partial_utf8(buf, len); } buf_block_reader reader(buf, len); json_structural_indexer indexer(parser.structural_indexes.get()); - - // Read all but the last block while (reader.has_full_block()) { indexer.step(reader.full_block(), reader); } - // Take care of the last block (will always be there unless file is empty) - uint8_t block[STEP_SIZE]; - if (unlikely(reader.get_remainder(block) == 0)) { return EMPTY; } - indexer.step(block, reader); + if (likely(reader.has_remainder())) { + uint8_t block[STEP_SIZE]; + reader.get_remainder(block); + indexer.step(block, reader); + } - return indexer.finish(parser, reader.block_index(), len, partial); -} - -template<> -really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept { - simd::simd8x64 in_1(block); - simd::simd8x64 in_2(block+64); - json_block block_1 = scanner.next(in_1); - json_block block_2 = scanner.next(in_2); - this->next(in_1, block_1, reader.block_index()); - this->next(in_2, block_2, reader.block_index()+64); - reader.advance(); -} - -template<> -really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept { - simd::simd8x64 in_1(block); - json_block block_1 = scanner.next(in_1); - this->next(in_1, block_1, reader.block_index()); - reader.advance(); -} - -really_inline void json_structural_indexer::next(simd::simd8x64 in, json_block block, size_t idx) { - uint64_t unescaped = in.lteq(0x1F); - checker.check_next_input(in); - indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser - prev_structurals = block.structural_start(); - unescaped_chars_error |= block.non_quote_inside_string(unescaped); -} - -really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) { - // Write out the final iteration's structurals - indexer.write(uint32_t(idx-64), prev_structurals); - - error_code error = scanner.finish(partial); - if (unlikely(error != SUCCESS)) { return error; } - - if (unescaped_chars_error) { - return UNESCAPED_CHARS; - } - - parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); - /*** - * This is related to https://github.com/simdjson/simdjson/issues/906 - * Basically, we want to make sure that if the parsing continues beyond the last (valid) - * structural character, it quickly stops. - * Only three structural characters can be repeated without triggering an error in JSON: [,] and }. - * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing - * continues, then it must be [,] or }. - * Suppose it is ] or }. We backtrack to the first character, what could it be that would - * not trigger an error? It could be ] or } but no, because you can't start a document that way. - * It can't be a comma, a colon or any simple value. So the only way we could continue is - * if the repeated character is [. But if so, the document must start with [. But if the document - * starts with [, it should end with ]. If we enforce that rule, then we would get - * ][[ which is invalid. - **/ - parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); - parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); - parser.structural_indexes[parser.n_structural_indexes + 2] = 0; - parser.next_structural_index = 0; - // a valid JSON file cannot have zero structural indexes - we should have found something - if (unlikely(parser.n_structural_indexes == 0u)) { - return EMPTY; - } - if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { - return UNEXPECTED_ERROR; - } - if (partial) { - auto new_structural_indexes = find_next_document_index(parser); - if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { - return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. - } - parser.n_structural_indexes = new_structural_indexes; - } - return checker.errors(); + return indexer.finish(parser, reader.block_index(), len, streaming); } } // namespace stage1 /* end file src/generic/stage1/json_structural_indexer.h */ -WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept { - this->buf = _buf; - this->len = _len; - return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming); +WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept { + return haswell::stage1::json_structural_indexer::index<128>(buf, len, parser, streaming); } } // namespace haswell + } // namespace simdjson UNTARGET_REGION - -// -// Stage 2 -// +/* end file src/generic/stage1/json_structural_indexer.h */ +/* begin file src/haswell/stage2.cpp */ +/* haswell/implementation.h already included: #include "haswell/implementation.h" */ /* begin file src/haswell/stringparsing.h */ #ifndef SIMDJSON_HASWELL_STRINGPARSING_H #define SIMDJSON_HASWELL_STRINGPARSING_H @@ -9456,10 +9015,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) { // If you consume a large value and you map it to "infinity", you will no // longer be able to serialize back a standard-compliant JSON. And there is // no realistic application where you might need values so large than they - // can't fit in binary64. The maximal value is about 1.7976931348623157 x + // can't fit in binary64. The maximal value is about 1.7976931348623157 × // 10^308 It is an unimaginable large number. There will never be any piece of // engineering involving as many as 10^308 parts. It is estimated that there - // are about 10^80 atoms in the universe. The estimate for the total number + // are about 10^80 atoms in the universe.  The estimate for the total number // of electrons is similar. Using a double-precision floating-point value, we // can represent easily the number of atoms in the universe. We could also // represent the number of ways you can pick any three individual atoms at @@ -9479,6 +9038,26 @@ really_inline bool is_integer(char c) { // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers } +// We need to check that the character following a zero is valid. This is +// probably frequent and it is harder than it looks. We are building all of this +// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)... +const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + +really_inline bool +is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) { + return structural_or_whitespace_or_exponent_or_decimal_negated[c]; +} // check quickly whether the next 8 chars are made of digits // at a glance, it looks better than Mula's @@ -9556,14 +9135,14 @@ never_inline bool parse_large_integer(const uint8_t *const src, // as a positive signed integer, but the negative version is // possible. constexpr int64_t signed_answer = INT64_MIN; - writer.append_s64(signed_answer); + writer.write_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif } else { // we can negate safely int64_t signed_answer = -static_cast(i); - writer.append_s64(signed_answer); + writer.write_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif @@ -9576,12 +9155,12 @@ never_inline bool parse_large_integer(const uint8_t *const src, #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif - writer.append_s64(i); + writer.write_s64(i); } else { #ifdef JSON_TEST_NUMBERS // for unit testing found_unsigned_integer(i, src); #endif - writer.append_u64(i); + writer.write_u64(i); } } return is_structural_or_whitespace(*p); @@ -9591,7 +9170,7 @@ template bool slow_float_parsing(UNUSED const char * src, W writer) { double d; if (parse_float_strtod(src, &d)) { - writer.append_double(d); + writer.write_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, (const uint8_t *)src); #endif @@ -9615,10 +9194,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) { template really_inline bool parse_number(UNUSED const uint8_t *const src, UNUSED bool found_minus, - W &writer) { + W writer) { #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes // useful to skip parsing - writer.append_s64(0); // always write zero + writer.write_s64(0); // always write zero return true; // always succeeds #else const char *p = reinterpret_cast(src); @@ -9638,7 +9217,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, uint64_t i; // an unsigned int avoids signed overflows (which are bad) if (*p == '0') { // 0 cannot be followed by an integer ++p; - if (is_integer(*p)) { + if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) { #ifdef JSON_TEST_NUMBERS // for unit testing found_invalid_number(src); #endif @@ -9762,7 +9341,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, } // we over-decrement by one when there is a '.' digit_count -= int(start - start_digits); - if (unlikely(digit_count >= 19)) { + if (digit_count >= 19) { // Ok, chances are good that we had an overflow! // this is almost never going to get called!!! // we start anew, going slowly!!! @@ -9770,22 +9349,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, // 10000000000000000000000000000000000000000000e+308 // 3.1415926535897932384626433832795028841971693993751 // - bool success = slow_float_parsing((const char *) src, writer); - // The number was already written, but we made a copy of the writer - // when we passed it to the parse_large_integer() function, so - writer.skip_double(); - return success; + return slow_float_parsing((const char *) src, writer); } } if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!! // this is almost never going to get called!!! // we start anew, going slowly!!! - bool success = slow_float_parsing((const char *) src, writer); - // The number was already written, but we made a copy of the writer when we passed it to the - // slow_float_parsing() function, so we have to skip those tape spots now that we've returned - writer.skip_double(); - return success; + return slow_float_parsing((const char *) src, writer); } bool success = true; double d = compute_float_64(exponent, i, negative, &success); @@ -9794,7 +9365,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, success = parse_float_strtod((const char *)src, &d); } if (success) { - writer.append_double(d); + writer.write_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, src); #endif @@ -9809,14 +9380,10 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, if (unlikely(digit_count >= 18)) { // this is uncommon!!! // there is a good chance that we had an overflow, so we need // need to recover: we parse the whole thing again. - bool success = parse_large_integer(src, writer, found_minus); - // The number was already written, but we made a copy of the writer - // when we passed it to the parse_large_integer() function, so - writer.skip_large_integer(); - return success; + return parse_large_integer(src, writer, found_minus); } i = negative ? 0 - i : i; - writer.append_s64(i); + writer.write_s64(i); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif @@ -9841,72 +9408,6 @@ TARGET_HASWELL namespace simdjson { namespace haswell { -/* begin file src/generic/stage2/logger.h */ -// This is for an internal-only stage 2 specific logger. -// Set LOG_ENABLED = true to log what stage 2 is doing! -namespace logger { - static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; - - static constexpr const bool LOG_ENABLED = false; - static constexpr const int LOG_EVENT_LEN = 30; - static constexpr const int LOG_BUFFER_LEN = 20; - static constexpr const int LOG_DETAIL_LEN = 50; - static constexpr const int LOG_INDEX_LEN = 10; - - static int log_depth; // Not threadsafe. Log only. - - // Helper to turn unprintable or newline characters into spaces - static really_inline char printable_char(char c) { - if (c >= 0x20) { - return c; - } else { - return ' '; - } - } - - // Print the header and set up log_start - static really_inline void log_start() { - if (LOG_ENABLED) { - log_depth = 0; - printf("\n"); - printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index"); - printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES); - } - } - - static really_inline void log_string(const char *message) { - if (LOG_ENABLED) { - printf("%s\n", message); - } - } - - // Logs a single line of - template - static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { - if (LOG_ENABLED) { - printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); - { - // Print the next N characters in the buffer. - printf("| "); - // Otherwise, print the characters starting from the buffer position. - // Print spaces for unprintable or newline characters. - for (int i=0;i really_inline bool with_space_terminated_copy(const F& f) { @@ -10009,25 +9502,32 @@ class structural_iterator { * practice unless you are in the strange scenario where you have many JSON * documents made of single atoms. */ - char *copy = static_cast(malloc(parser.len + SIMDJSON_PADDING)); + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); if (copy == nullptr) { return true; } - memcpy(copy, buf, parser.len); - memset(copy + parser.len, ' ', SIMDJSON_PADDING); - bool result = f(reinterpret_cast(copy), *current_structural); + memcpy(copy, buf, len); + memset(copy + len, ' ', SIMDJSON_PADDING); + bool result = f(reinterpret_cast(copy), idx); free(copy); return result; } really_inline bool past_end(uint32_t n_structural_indexes) { - return current_structural >= &parser.structural_indexes[n_structural_indexes]; + return next_structural+1 > n_structural_indexes; } really_inline bool at_end(uint32_t n_structural_indexes) { - return current_structural == &parser.structural_indexes[n_structural_indexes]; + return next_structural+1 == n_structural_indexes; } - really_inline bool at_beginning() { - return current_structural == parser.structural_indexes.get(); + really_inline size_t next_structural_index() { + return next_structural; } + + const uint8_t* const buf; + const size_t len; + const uint32_t* const structural_indexes; + size_t next_structural; // next structural index + size_t idx{0}; // location of the structural character in the input (buf) + uint8_t c{0}; // used to track the (structural) character we are looking at }; } // namespace stage2 @@ -10039,105 +9539,8 @@ class structural_iterator { // "simdjson/stage2.h" (this simplifies amalgation) namespace stage2 { -namespace { // Make everything here private - -/* begin file src/generic/stage2/tape_writer.h */ -struct tape_writer { - /** The next place to write to tape */ - uint64_t *next_tape_loc; - - /** Write a signed 64-bit value to tape. */ - really_inline void append_s64(int64_t value) noexcept; - - /** Write an unsigned 64-bit value to tape. */ - really_inline void append_u64(uint64_t value) noexcept; - - /** Write a double value to tape. */ - really_inline void append_double(double value) noexcept; - - /** - * Append a tape entry (an 8-bit type,and 56 bits worth of value). - */ - really_inline void append(uint64_t val, internal::tape_type t) noexcept; - - /** - * Skip the current tape entry without writing. - * - * Used to skip the start of the container, since we'll come back later to fill it in when the - * container ends. - */ - really_inline void skip() noexcept; - - /** - * Skip the number of tape entries necessary to write a large u64 or i64. - */ - really_inline void skip_large_integer() noexcept; - - /** - * Skip the number of tape entries necessary to write a double. - */ - really_inline void skip_double() noexcept; - - /** - * Write a value to a known location on tape. - * - * Used to go back and write out the start of a container after the container ends. - */ - really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; - -private: - /** - * Append both the tape entry, and a supplementary value following it. Used for types that need - * all 64 bits, such as double and uint64_t. - */ - template - really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; -}; // struct number_writer - -really_inline void tape_writer::append_s64(int64_t value) noexcept { - append2(0, value, internal::tape_type::INT64); -} - -really_inline void tape_writer::append_u64(uint64_t value) noexcept { - append(0, internal::tape_type::UINT64); - *next_tape_loc = value; - next_tape_loc++; -} - -/** Write a double value to tape. */ -really_inline void tape_writer::append_double(double value) noexcept { - append2(0, value, internal::tape_type::DOUBLE); -} - -really_inline void tape_writer::skip() noexcept { - next_tape_loc++; -} - -really_inline void tape_writer::skip_large_integer() noexcept { - next_tape_loc += 2; -} - -really_inline void tape_writer::skip_double() noexcept { - next_tape_loc += 2; -} -really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept { - *next_tape_loc = val | ((uint64_t(char(t))) << 56); - next_tape_loc++; -} - -template -really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept { - append(val, t); - static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); - memcpy(next_tape_loc, &val2, sizeof(val2)); - next_tape_loc++; -} - -really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept { - tape_loc = val | ((uint64_t(char(t))) << 56); -} -/* end file src/generic/stage2/tape_writer.h */ +using internal::ret_address; #ifdef SIMDJSON_USE_COMPUTED_GOTO #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue } @@ -10168,88 +9571,102 @@ really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal #endif // SIMDJSON_USE_COMPUTED_GOTO struct unified_machine_addresses { - ret_address_t array_begin; - ret_address_t array_continue; - ret_address_t error; - ret_address_t finish; - ret_address_t object_begin; - ret_address_t object_continue; + ret_address array_begin; + ret_address array_continue; + ret_address error; + ret_address finish; + ret_address object_begin; + ret_address object_continue; }; #undef FAIL_IF #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } } -struct structural_parser : structural_iterator { - /** Lets you append to the tape */ - tape_writer tape; +struct number_writer { + parser &doc_parser; + + really_inline void write_s64(int64_t value) noexcept { + write_tape(0, internal::tape_type::INT64); + std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value)); + ++doc_parser.current_loc; + } + really_inline void write_u64(uint64_t value) noexcept { + write_tape(0, internal::tape_type::UINT64); + doc_parser.doc.tape[doc_parser.current_loc++] = value; + } + really_inline void write_double(double value) noexcept { + write_tape(0, internal::tape_type::DOUBLE); + static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size"); + memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double)); + // doc.tape[doc.current_loc++] = *((uint64_t *)&d); + } + really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { + doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); + } +}; // struct number_writer + +struct structural_parser { + structural_iterator structurals; + parser &doc_parser; /** Next write location in the string buf for stage 2 parsing */ - uint8_t *current_string_buf_loc; - /** Current depth (nested objects and arrays) */ - uint32_t depth{0}; - - // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations - really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index) - : structural_iterator(_parser, start_structural_index), - tape{parser.doc->tape.get()}, - current_string_buf_loc{parser.doc->string_buf.get()} { - } - - WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) { - parser.containing_scope[depth].tape_index = next_tape_index(); - parser.containing_scope[depth].count = 0; - tape.skip(); // We don't actually *write* the start element until the end. - parser.ret_address[depth] = continue_state; + uint8_t *current_string_buf_loc{}; + uint32_t depth; + + really_inline structural_parser( + const uint8_t *buf, + size_t len, + parser &_doc_parser, + uint32_t next_structural = 0 + ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {} + + WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) { + doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc; + doc_parser.containing_scope[depth].count = 0; + write_tape(0, type); // if the document is correct, this gets rewritten later + doc_parser.ret_address[depth] = continue_state; depth++; - bool exceeded_max_depth = depth >= parser.max_depth(); - if (exceeded_max_depth) { log_error("Exceeded max depth!"); } - return exceeded_max_depth; + return depth >= doc_parser.max_depth(); } - WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) { - log_start_value("document"); - return start_scope(continue_state); + WARN_UNUSED really_inline bool start_document(ret_address continue_state) { + return start_scope(internal::tape_type::ROOT, continue_state); } - WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) { - log_start_value("object"); - return start_scope(continue_state); + WARN_UNUSED really_inline bool start_object(ret_address continue_state) { + return start_scope(internal::tape_type::START_OBJECT, continue_state); } - WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) { - log_start_value("array"); - return start_scope(continue_state); + WARN_UNUSED really_inline bool start_array(ret_address continue_state) { + return start_scope(internal::tape_type::START_ARRAY, continue_state); } // this function is responsible for annotating the start of the scope - really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept { + really_inline void end_scope(internal::tape_type type) noexcept { depth--; - // write our doc->tape location to the header scope + // write our doc.tape location to the header scope // The root scope gets written *at* the previous location. - tape.append(parser.containing_scope[depth].tape_index, end); + write_tape(doc_parser.containing_scope[depth].tape_index, type); // count can overflow if it exceeds 24 bits... so we saturate // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). - const uint32_t start_tape_index = parser.containing_scope[depth].tape_index; - const uint32_t count = parser.containing_scope[depth].count; + const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index; + const uint32_t count = doc_parser.containing_scope[depth].count; const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; - // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index] - tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start); - } - - really_inline uint32_t next_tape_index() { - return uint32_t(tape.next_tape_loc - parser.doc->tape.get()); + // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index] + doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32); } really_inline void end_object() { - log_end_value("object"); - end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); + end_scope(internal::tape_type::END_OBJECT); } really_inline void end_array() { - log_end_value("array"); - end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); + end_scope(internal::tape_type::END_ARRAY); } really_inline void end_document() { - log_end_value("document"); - end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT); + end_scope(internal::tape_type::ROOT); + } + + really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { + doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); } // increment_count increments the count of keys in an object or values in an array. @@ -10257,16 +9674,17 @@ struct structural_parser : structural_iterator { // must be increment in the preceding depth (depth-1) where the array or // the object resides. really_inline void increment_count() { - parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 + doc_parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 } really_inline uint8_t *on_start_string() noexcept { - // we advance the point, accounting for the fact that we have a NULL termination - tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING); + /* we advance the point, accounting for the fact that we have a NULL + * termination */ + write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING); return current_string_buf_loc + sizeof(uint32_t); } - really_inline void on_end_string(uint8_t *dst) noexcept { + really_inline bool on_end_string(uint8_t *dst) noexcept { uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); // TODO check for overflow in case someone has a crazy string (>=4GB?) // But only add the overflow check when the document itself exceeds 4GB @@ -10276,49 +9694,73 @@ struct structural_parser : structural_iterator { // be NULL terminated? It comes at a small cost *dst = 0; current_string_buf_loc = dst + 1; + return true; } - WARN_UNUSED really_inline bool parse_string(bool key = false) { - log_value(key ? "key" : "string"); + WARN_UNUSED really_inline bool parse_string() { uint8_t *dst = on_start_string(); - dst = stringparsing::parse_string(current(), dst); + dst = stringparsing::parse_string(structurals.current(), dst); if (dst == nullptr) { - log_error("Invalid escape in string"); return true; } - on_end_string(dst); - return false; + return !on_end_string(dst); } WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) { - log_value("number"); - bool succeeded = numberparsing::parse_number(src, found_minus, tape); - if (!succeeded) { log_error("Invalid number"); } - return !succeeded; + number_writer writer{doc_parser}; + return !numberparsing::parse_number(src, found_minus, writer); } WARN_UNUSED really_inline bool parse_number(bool found_minus) { - return parse_number(current(), found_minus); + return parse_number(structurals.current(), found_minus); + } + + WARN_UNUSED really_inline bool parse_atom() { + switch (structurals.current_char()) { + case 't': + if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; } + write_tape(0, internal::tape_type::TRUE_VALUE); + break; + case 'f': + if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; } + write_tape(0, internal::tape_type::FALSE_VALUE); + break; + case 'n': + if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; } + write_tape(0, internal::tape_type::NULL_VALUE); + break; + default: + return true; + } + return false; + } + + WARN_UNUSED really_inline bool parse_single_atom() { + switch (structurals.current_char()) { + case 't': + if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; } + write_tape(0, internal::tape_type::TRUE_VALUE); + break; + case 'f': + if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; } + write_tape(0, internal::tape_type::FALSE_VALUE); + break; + case 'n': + if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; } + write_tape(0, internal::tape_type::NULL_VALUE); + break; + default: + return true; + } + return false; } - WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) { - switch (advance_char()) { + WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) { + switch (structurals.current_char()) { case '"': FAIL_IF( parse_string() ); return continue_state; - case 't': - log_value("true"); - FAIL_IF( !atomparsing::is_valid_true_atom(current()) ); - tape.append(0, internal::tape_type::TRUE_VALUE); - return continue_state; - case 'f': - log_value("false"); - FAIL_IF( !atomparsing::is_valid_false_atom(current()) ); - tape.append(0, internal::tape_type::FALSE_VALUE); - return continue_state; - case 'n': - log_value("null"); - FAIL_IF( !atomparsing::is_valid_null_atom(current()) ); - tape.append(0, internal::tape_type::NULL_VALUE); + case 't': case 'f': case 'n': + FAIL_IF( parse_atom() ); return continue_state; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': @@ -10334,27 +9776,40 @@ struct structural_parser : structural_iterator { FAIL_IF( start_array(continue_state) ); return addresses.array_begin; default: - log_error("Non-value found when value was expected!"); return addresses.error; } } WARN_UNUSED really_inline error_code finish() { + // the string might not be NULL terminated. + if ( !structurals.at_end(doc_parser.n_structural_indexes) ) { + return on_error(TAPE_ERROR); + } end_document(); - parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]); - if (depth != 0) { - log_error("Unclosed objects or arrays!"); - return parser.error = TAPE_ERROR; + return on_error(TAPE_ERROR); + } + if (doc_parser.containing_scope[depth].tape_index != 0) { + return on_error(TAPE_ERROR); } - return SUCCESS; + return on_success(SUCCESS); + } + + really_inline error_code on_error(error_code new_error_code) noexcept { + doc_parser.error = new_error_code; + return new_error_code; + } + really_inline error_code on_success(error_code success_code) noexcept { + doc_parser.error = success_code; + doc_parser.valid = true; + return success_code; } WARN_UNUSED really_inline error_code error() { - /* We do not need the next line because this is done by parser.init_stage2(), + /* We do not need the next line because this is done by doc_parser.init_stage2(), * pessimistically. - * parser.is_valid = false; + * doc_parser.is_valid = false; * At this point in the code, we have all the time in the world. * Note that we know exactly where we are in the document so we could, * without any overhead on the processing code, report a specific @@ -10362,12 +9817,12 @@ struct structural_parser : structural_iterator { * We could even trigger special code paths to assess what happened * carefully, * all without any added cost. */ - if (depth >= parser.max_depth()) { - return parser.error = DEPTH_ERROR; + if (depth >= doc_parser.max_depth()) { + return on_error(DEPTH_ERROR); } - switch (current_char()) { + switch (structurals.current_char()) { case '"': - return parser.error = STRING_ERROR; + return on_error(STRING_ERROR); case '0': case '1': case '2': @@ -10379,173 +9834,302 @@ struct structural_parser : structural_iterator { case '8': case '9': case '-': - return parser.error = NUMBER_ERROR; + return on_error(NUMBER_ERROR); case 't': - return parser.error = T_ATOM_ERROR; + return on_error(T_ATOM_ERROR); case 'n': - return parser.error = N_ATOM_ERROR; + return on_error(N_ATOM_ERROR); case 'f': - return parser.error = F_ATOM_ERROR; + return on_error(F_ATOM_ERROR); default: - return parser.error = TAPE_ERROR; + return on_error(TAPE_ERROR); } } really_inline void init() { - log_start(); - parser.error = UNINITIALIZED; + current_string_buf_loc = doc_parser.doc.string_buf.get(); + doc_parser.current_loc = 0; + doc_parser.valid = false; + doc_parser.error = UNINITIALIZED; } - WARN_UNUSED really_inline error_code start(ret_address_t finish_state) { - // If there are no structurals left, return EMPTY - if (at_end(parser.n_structural_indexes)) { - return parser.error = EMPTY; + WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) { + init(); // sets is_valid to false + if (len > doc_parser.capacity()) { + return CAPACITY; } - - init(); + // Advance to the first character as soon as possible + structurals.advance_char(); // Push the root scope (there is always at least one scope) if (start_document(finish_state)) { - return parser.error = DEPTH_ERROR; + return on_error(DEPTH_ERROR); } return SUCCESS; } - really_inline void log_value(const char *type) { - logger::log_line(*this, "", type, ""); + really_inline char advance_char() { + return structurals.advance_char(); + } +}; + +// Redefine FAIL_IF to use goto since it'll be used inside the function now +#undef FAIL_IF +#define FAIL_IF(EXPR) { if (EXPR) { goto error; } } + +} // namespace stage2 + +/************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. + ***********/ +WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { + static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); + stage2::structural_parser parser(buf, len, doc_parser); + error_code result = parser.start(len, addresses.finish); + if (result) { return result; } + + // + // Read first value + // + switch (parser.structurals.current_char()) { + case '{': + FAIL_IF( parser.start_object(addresses.finish) ); + goto object_begin; + case '[': + FAIL_IF( parser.start_array(addresses.finish) ); + goto array_begin; + case '"': + FAIL_IF( parser.parse_string() ); + goto finish; + case 't': case 'f': case 'n': + FAIL_IF( parser.parse_single_atom() ); + goto finish; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + FAIL_IF( + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + return parser.parse_number(©[idx], false); + }) + ); + goto finish; + case '-': + FAIL_IF( + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + return parser.parse_number(©[idx], true); + }) + ); + goto finish; + default: + goto error; + } + +// +// Object parser states +// +object_begin: + switch (parser.advance_char()) { + case '"': { + parser.increment_count(); + FAIL_IF( parser.parse_string() ); + goto object_key_state; + } + case '}': + parser.end_object(); + goto scope_end; + default: + goto error; + } + +object_key_state: + FAIL_IF( parser.advance_char() != ':' ); + parser.advance_char(); + GOTO( parser.parse_value(addresses, addresses.object_continue) ); + +object_continue: + switch (parser.advance_char()) { + case ',': + parser.increment_count(); + FAIL_IF( parser.advance_char() != '"' ); + FAIL_IF( parser.parse_string() ); + goto object_key_state; + case '}': + parser.end_object(); + goto scope_end; + default: + goto error; + } + +scope_end: + CONTINUE( parser.doc_parser.ret_address[parser.depth] ); + +// +// Array parser states +// +array_begin: + if (parser.advance_char() == ']') { + parser.end_array(); + goto scope_end; } + parser.increment_count(); - static really_inline void log_start() { - logger::log_start(); +main_array_switch: + /* we call update char on all paths in, so we can peek at parser.c on the + * on paths that can accept a close square brace (post-, and at start) */ + GOTO( parser.parse_value(addresses, addresses.array_continue) ); + +array_continue: + switch (parser.advance_char()) { + case ',': + parser.increment_count(); + parser.advance_char(); + goto main_array_switch; + case ']': + parser.end_array(); + goto scope_end; + default: + goto error; } - really_inline void log_start_value(const char *type) { - logger::log_line(*this, "+", type, ""); - if (logger::LOG_ENABLED) { logger::log_depth++; } +finish: + return parser.finish(); + +error: + return parser.error(); +} + +WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { + error_code code = stage1(buf, len, doc_parser, false); + if (!code) { + code = stage2(buf, len, doc_parser); } + return code; +} +/* end file src/generic/stage2/structural_parser.h */ +/* begin file src/generic/stage2/streaming_structural_parser.h */ +namespace stage2 { + +struct streaming_structural_parser: structural_parser { + really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {} - really_inline void log_end_value(const char *type) { - if (logger::LOG_ENABLED) { logger::log_depth--; } - logger::log_line(*this, "-", type, ""); + // override to add streaming + WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) { + init(); // sets is_valid to false + // Capacity ain't no thang for streaming, so we don't check it. + // Advance to the first character as soon as possible + advance_char(); + // Push the root scope (there is always at least one scope) + if (start_document(finish_parser)) { + return on_error(DEPTH_ERROR); + } + return SUCCESS; } - really_inline void log_error(const char *error) { - logger::log_line(*this, "", "ERROR", error); + // override to add streaming + WARN_UNUSED really_inline error_code finish() { + if ( structurals.past_end(doc_parser.n_structural_indexes) ) { + return on_error(TAPE_ERROR); + } + end_document(); + if (depth != 0) { + return on_error(TAPE_ERROR); + } + if (doc_parser.containing_scope[depth].tape_index != 0) { + return on_error(TAPE_ERROR); + } + bool finished = structurals.at_end(doc_parser.n_structural_indexes); + return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE); } -}; // struct structural_parser +}; -// Redefine FAIL_IF to use goto since it'll be used inside the function now -#undef FAIL_IF -#define FAIL_IF(EXPR) { if (EXPR) { goto error; } } +} // namespace stage2 -template -WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept { - dom_parser.doc = &doc; +/************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. + ***********/ +WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept { static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0); - error_code result = parser.start(addresses.finish); + stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json)); + error_code result = parser.start(len, addresses.finish); if (result) { return result; } - // // Read first value // - switch (parser.current_char()) { + switch (parser.structurals.current_char()) { case '{': FAIL_IF( parser.start_object(addresses.finish) ); goto object_begin; case '[': FAIL_IF( parser.start_array(addresses.finish) ); - // Make sure the outer array is closed before continuing; otherwise, there are ways we could get - // into memory corruption. See https://github.com/simdjson/simdjson/issues/906 - if (!STREAMING) { - if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') { - goto error; - } - } goto array_begin; case '"': FAIL_IF( parser.parse_string() ); goto finish; - case 't': - parser.log_value("true"); - FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) ); - parser.tape.append(0, internal::tape_type::TRUE_VALUE); - goto finish; - case 'f': - parser.log_value("false"); - FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) ); - parser.tape.append(0, internal::tape_type::FALSE_VALUE); - goto finish; - case 'n': - parser.log_value("null"); - FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) ); - parser.tape.append(0, internal::tape_type::NULL_VALUE); + case 't': case 'f': case 'n': + FAIL_IF( parser.parse_single_atom() ); goto finish; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': FAIL_IF( - parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], false); }) ); goto finish; case '-': FAIL_IF( - parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], true); }) ); goto finish; default: - parser.log_error("Document starts with a non-value character"); goto error; } // -// Object parser states +// Object parser parsers // object_begin: switch (parser.advance_char()) { case '"': { - parser.increment_count(); - FAIL_IF( parser.parse_string(true) ); - goto object_key_state; + FAIL_IF( parser.parse_string() ); + goto object_key_parser; } case '}': parser.end_object(); goto scope_end; default: - parser.log_error("Object does not start with a key"); goto error; } -object_key_state: - if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; } +object_key_parser: + FAIL_IF( parser.advance_char() != ':' ); + parser.increment_count(); + parser.advance_char(); GOTO( parser.parse_value(addresses, addresses.object_continue) ); object_continue: switch (parser.advance_char()) { case ',': - parser.increment_count(); - if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; } - FAIL_IF( parser.parse_string(true) ); - goto object_key_state; + FAIL_IF( parser.advance_char() != '"' ); + FAIL_IF( parser.parse_string() ); + goto object_key_parser; case '}': parser.end_object(); goto scope_end; default: - parser.log_error("No comma between object fields"); goto error; } scope_end: - CONTINUE( parser.parser.ret_address[parser.depth] ); + CONTINUE( parser.doc_parser.ret_address[parser.depth] ); // -// Array parser states +// Array parser parsers // array_begin: - if (parser.peek_next_char() == ']') { - parser.advance_char(); + if (parser.advance_char() == ']') { parser.end_array(); goto scope_end; } @@ -10560,208 +10144,31 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p switch (parser.advance_char()) { case ',': parser.increment_count(); + parser.advance_char(); goto main_array_switch; case ']': parser.end_array(); goto scope_end; default: - parser.log_error("Missing comma between array values"); goto error; } finish: + next_json = parser.structurals.next_structural_index(); return parser.finish(); error: return parser.error(); } - -} // namespace {} -} // namespace stage2 - -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. - ***********/ -WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { - error_code result = stage2::parse_structurals(*this, _doc); - if (result) { return result; } - - // If we didn't make it to the end, it's an error - if ( next_structural_index != n_structural_indexes ) { - logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); - return error = TAPE_ERROR; - } - - return SUCCESS; -} - -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. - ***********/ -WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { - return stage2::parse_structurals(*this, _doc); -} -/* end file src/generic/stage2/tape_writer.h */ - -WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { - error_code err = stage1(_buf, _len, false); - if (err) { return err; } - return stage2(_doc); -} +/* end file src/generic/stage2/streaming_structural_parser.h */ } // namespace haswell } // namespace simdjson UNTARGET_REGION -/* end file src/generic/stage2/tape_writer.h */ +/* end file src/generic/stage2/streaming_structural_parser.h */ #endif #if SIMDJSON_IMPLEMENTATION_WESTMERE -/* begin file src/westmere/implementation.cpp */ -/* westmere/implementation.h already included: #include "westmere/implementation.h" */ -/* begin file src/westmere/dom_parser_implementation.h */ -#ifndef SIMDJSON_WESTMERE_DOM_PARSER_IMPLEMENTATION_H -#define SIMDJSON_WESTMERE_DOM_PARSER_IMPLEMENTATION_H - -/* isadetection.h already included: #include "isadetection.h" */ - -namespace simdjson { -namespace westmere { - -/* begin file src/generic/dom_parser_implementation.h */ -// expectation: sizeof(scope_descriptor) = 64/8. -struct scope_descriptor { - uint32_t tape_index; // where, on the tape, does the scope ([,{) begins - uint32_t count; // how many elements in the scope -}; // struct scope_descriptor - -#ifdef SIMDJSON_USE_COMPUTED_GOTO -typedef void* ret_address_t; -#else -typedef char ret_address_t; -#endif - -class dom_parser_implementation final : public internal::dom_parser_implementation { -public: - /** Tape location of each open { or [ */ - std::unique_ptr containing_scope{}; - /** Return address of each open { or [ */ - std::unique_ptr ret_address{}; - /** Buffer passed to stage 1 */ - const uint8_t *buf{}; - /** Length passed to stage 1 */ - size_t len{0}; - /** Document passed to stage 2 */ - dom::document *doc{}; - /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */ - error_code error{UNINITIALIZED}; - - really_inline dom_parser_implementation(); - dom_parser_implementation(const dom_parser_implementation &) = delete; - dom_parser_implementation & operator=(const dom_parser_implementation &) = delete; - - WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; - WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final; - WARN_UNUSED error_code check_for_unclosed_array() noexcept; - WARN_UNUSED error_code stage2(dom::document &doc) noexcept final; - WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final; - WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final; - WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final; -}; - -/* begin file src/generic/stage1/allocate.h */ -namespace stage1 { -namespace allocate { - -// -// Allocates stage 1 internal state and outputs in the parser -// -really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) { - size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; - parser.structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); - if (!parser.structural_indexes) { return MEMALLOC; } - parser.structural_indexes[0] = 0; - parser.n_structural_indexes = 0; - return SUCCESS; -} - -} // namespace allocate -} // namespace stage1 -/* end file src/generic/stage1/allocate.h */ -/* begin file src/generic/stage2/allocate.h */ -namespace stage2 { -namespace allocate { - -// -// Allocates stage 2 internal state and outputs in the parser -// -really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) { - parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]); - parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]); - - if (!parser.ret_address || !parser.containing_scope) { - return MEMALLOC; - } - return SUCCESS; -} - -} // namespace allocate -} // namespace stage2 -/* end file src/generic/stage2/allocate.h */ - -really_inline dom_parser_implementation::dom_parser_implementation() {} - -// Leaving these here so they can be inlined if so desired -WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept { - error_code err = stage1::allocate::set_capacity(*this, capacity); - if (err) { _capacity = 0; return err; } - _capacity = capacity; - return SUCCESS; -} - -WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { - error_code err = stage2::allocate::set_max_depth(*this, max_depth); - if (err) { _max_depth = 0; return err; } - _max_depth = max_depth; - return SUCCESS; -} -/* end file src/generic/stage2/allocate.h */ - -} // namespace westmere -} // namespace simdjson - -#endif // SIMDJSON_WESTMERE_DOM_PARSER_IMPLEMENTATION_H -/* end file src/generic/stage2/allocate.h */ - -TARGET_HASWELL - -namespace simdjson { -namespace westmere { - -WARN_UNUSED error_code implementation::create_dom_parser_implementation( - size_t capacity, - size_t max_depth, - std::unique_ptr& dst -) const noexcept { - dst.reset( new (std::nothrow) dom_parser_implementation() ); - if (!dst) { return MEMALLOC; } - dst->set_capacity(capacity); - dst->set_max_depth(max_depth); - return SUCCESS; -} - -} // namespace westmere -} // namespace simdjson - -UNTARGET_REGION -/* end file src/generic/stage2/allocate.h */ -/* begin file src/westmere/dom_parser_implementation.cpp */ -/* westmere/implementation.h already included: #include "westmere/implementation.h" */ -/* westmere/dom_parser_implementation.h already included: #include "westmere/dom_parser_implementation.h" */ - -// -// Stage 1 -// +/* begin file src/westmere/stage1.cpp */ /* begin file src/westmere/bitmask.h */ #ifndef SIMDJSON_WESTMERE_BITMASK_H #define SIMDJSON_WESTMERE_BITMASK_H @@ -11332,21 +10739,24 @@ really_inline simd8 must_be_continuation(simd8 prev1, simd8 struct buf_block_reader { public: - really_inline buf_block_reader(const uint8_t *_buf, size_t _len); - really_inline size_t block_index(); - really_inline bool has_full_block() const; - really_inline const uint8_t *full_block() const; - /** - * Get the last block, padded with spaces. - * - * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this - * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there - * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding. - * - * @return the number of effective characters in the last block. - */ - really_inline size_t get_remainder(uint8_t *dst) const; - really_inline void advance(); + really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} + really_inline size_t block_index() { return idx; } + really_inline bool has_full_block() const { + return idx < lenminusstep; + } + really_inline const uint8_t *full_block() const { + return &buf[idx]; + } + really_inline bool has_remainder() const { + return idx < len; + } + really_inline void get_remainder(uint8_t *tmp_buf) const { + memset(tmp_buf, 0x20, STEP_SIZE); + memcpy(tmp_buf, buf + idx, len - idx); + } + really_inline void advance() { + idx += STEP_SIZE; + } private: const uint8_t *buf; const size_t len; @@ -11354,18 +10764,6 @@ struct buf_block_reader { size_t idx; }; -constexpr const int TITLE_SIZE = 12; - -// Routines to print masks and text for debugging bitmask operations -UNUSED static char * format_input_text_64(const uint8_t *text) { - static char *buf = (char*)malloc(sizeof(simd8x64) + 1); - for (size_t i=0; i); i++) { - buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); - } - buf[sizeof(simd8x64)] = '\0'; - return buf; -} - // Routines to print masks and text for debugging bitmask operations UNUSED static char * format_input_text(const simd8x64 in) { static char *buf = (char*)malloc(sizeof(simd8x64) + 1); @@ -11385,34 +10783,6 @@ UNUSED static char * format_mask(uint64_t mask) { buf[64] = '\0'; return buf; } - -template -really_inline buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} - -template -really_inline size_t buf_block_reader::block_index() { return idx; } - -template -really_inline bool buf_block_reader::has_full_block() const { - return idx < lenminusstep; -} - -template -really_inline const uint8_t *buf_block_reader::full_block() const { - return &buf[idx]; -} - -template -really_inline size_t buf_block_reader::get_remainder(uint8_t *dst) const { - memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once. - memcpy(dst, buf + idx, len - idx); - return len - idx; -} - -template -really_inline void buf_block_reader::advance() { - idx += STEP_SIZE; -} /* end file src/generic/stage1/buf_block_reader.h */ /* begin file src/generic/stage1/json_string_scanner.h */ namespace stage1 { @@ -11712,15 +11082,13 @@ template error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept { buf_block_reader reader(buf, len); json_minifier minifier(dst); - - // Index the first n-1 blocks while (reader.has_full_block()) { minifier.step(reader.full_block(), reader); } - // Index the last (remainder) block, padded with spaces - uint8_t block[STEP_SIZE]; - if (likely(reader.get_remainder(block)) > 0) { + if (likely(reader.has_remainder())) { + uint8_t block[STEP_SIZE]; + reader.get_remainder(block); minifier.step(block, reader); } @@ -11733,94 +11101,6 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len); } -/* begin file src/generic/stage1/find_next_document_index.h */ -/** - * This algorithm is used to quickly identify the last structural position that - * makes up a complete document. - * - * It does this by going backwards and finding the last *document boundary* (a - * place where one value follows another without a comma between them). If the - * last document (the characters after the boundary) has an equal number of - * start and end brackets, it is considered complete. - * - * Simply put, we iterate over the structural characters, starting from - * the end. We consider that we found the end of a JSON document when the - * first element of the pair is NOT one of these characters: '{' '[' ';' ',' - * and when the second element is NOT one of these characters: '}' '}' ';' ','. - * - * This simple comparison works most of the time, but it does not cover cases - * where the batch's structural indexes contain a perfect amount of documents. - * In such a case, we do not have access to the structural index which follows - * the last document, therefore, we do not have access to the second element in - * the pair, and that means we cannot identify the last document. To fix this - * issue, we keep a count of the open and closed curly/square braces we found - * while searching for the pair. When we find a pair AND the count of open and - * closed curly/square braces is the same, we know that we just passed a - * complete document, therefore the last json buffer location is the end of the - * batch. - */ -really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) { - // TODO don't count separately, just figure out depth - auto arr_cnt = 0; - auto obj_cnt = 0; - for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { - auto idxb = parser.structural_indexes[i]; - switch (parser.buf[idxb]) { - case ':': - case ',': - continue; - case '}': - obj_cnt--; - continue; - case ']': - arr_cnt--; - continue; - case '{': - obj_cnt++; - break; - case '[': - arr_cnt++; - break; - } - auto idxa = parser.structural_indexes[i - 1]; - switch (parser.buf[idxa]) { - case '{': - case '[': - case ':': - case ',': - continue; - } - // Last document is complete, so the next document will appear after! - if (!arr_cnt && !obj_cnt) { - return parser.n_structural_indexes; - } - // Last document is incomplete; mark the document at i + 1 as the next one - return i; - } - return 0; -} - -// Skip the last character if it is partial -really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) { - if (unlikely(len < 3)) { - switch (len) { - case 2: - if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left - if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left - return len; - case 1: - if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left - return len; - case 0: - return len; - } - } - if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left - if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left - if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left - return len; -} -/* end file src/generic/stage1/find_next_document_index.h */ /* begin file src/generic/stage1/utf8_lookup2_algorithm.h */ // // Detect Unicode errors. @@ -11871,9 +11151,9 @@ really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) { // support values with more than 23 bits (which a 4-byte character supports). // // e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000) -// +// // Legal utf-8 byte sequences per http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94: -// +// // Code Points 1st 2s 3s 4s // U+0000..U+007F 00..7F // U+0080..U+07FF C2..DF 80..BF @@ -11888,7 +11168,6 @@ really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) { using namespace simd; namespace utf8_validation { - // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)". // // Find special case UTF-8 errors where the character is technically readable (has the right length) @@ -11933,7 +11212,7 @@ namespace utf8_validation { const simd8 byte_1_high = prev1.shr<4>().lookup_16( // [0___]____ (ASCII) - 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, // [10__]____ (continuation) 0, 0, 0, 0, @@ -11964,6 +11243,214 @@ namespace utf8_validation { return byte_1_high & byte_1_low & byte_2_high; } + // + // Validate the length of multibyte characters (that each multibyte character has the right number + // of continuation characters, and that all continuation characters are part of a multibyte + // character). + // + // Algorithm + // ========= + // + // This algorithm compares *expected* continuation characters with *actual* continuation bytes, + // and emits an error anytime there is a mismatch. + // + // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte + // characters, the file will look like this: + // + // | Character | 𝄞 | | | | ₿ | | | ֏ | | a | b | + // |-----------------------|----|----|----|----|----|----|----|----|----|----|----| + // | Character Length | 4 | | | | 3 | | | 2 | | 1 | 1 | + // | Byte | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 | + // | is_second_byte | | X | | | | X | | | X | | | + // | is_third_byte | | | X | | | | X | | | | | + // | is_fourth_byte | | | | X | | | | | | | | + // | expected_continuation | | X | X | X | | X | X | | X | | | + // | is_continuation | | X | X | X | | X | X | | X | | | + // + // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation): + // + // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not + // part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just + // floating around extra outside of any character, or that there is an illegal 5-byte character, + // or maybe it's at the beginning of the file before any characters have started; but it's an + // error in all these cases. + // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means + // we started a new character before we were finished with the current one. + // + // Getting the Previous Bytes + // -------------------------- + // + // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte + // character, we need to "shift the bytes" to find that out. This is what they mean: + // + // - `is_continuation`: if the current byte is a continuation. + // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character. + // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character. + // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character. + // + // We use shuffles to go n bytes back, selecting part of the current `input` and part of the + // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller + // function, because the 1-byte-back data is used by other checks as well. + // + // Getting the Continuation Mask + // ----------------------------- + // + // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as + // numbers, using signed `<` and `>` operations to check if they are continuations or leads. + // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because + // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones). + // + // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads," + // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them. + // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0. + // + // When treated as signed numbers, they look like this: + // + // | Type | High Bits | Binary Range | Signed | + // |--------------|------------|--------------|--------| + // | ASCII | `0` | `01111111` | 127 | + // | | | `00000000` | 0 | + // | 4+-Byte Lead | `1111` | `11111111` | -1 | + // | | | `11110000 | -16 | + // | 3-Byte Lead | `1110` | `11101111` | -17 | + // | | | `11100000 | -32 | + // | 2-Byte Lead | `110` | `11011111` | -33 | + // | | | `11000000 | -64 | + // | Continuation | `10` | `10111111` | -65 | + // | | | `10000000 | -128 | + // + // This makes it pretty easy to get the continuation mask! It's just a single comparison: + // + // ``` + // is_continuation = input < -64` + // ``` + // + // We can do something similar for the others, but it takes two comparisons instead of one: "is + // the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and + // `> -64`. Surely we can do better, they're right next to each other! + // + // Getting the is_xxx Masks: Shifting the Range + // -------------------------------------------- + // + // Notice *why* continuations were a single comparison. The actual *range* would require two + // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get + // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be + // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`. + // + // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps + // ASCII down into the negative, and puts 4+-Byte Lead at the top: + // + // | Type | High Bits | Binary Range | Signed | + // |----------------------|------------|--------------|-------| + // | 4+-Byte Lead (+ 127) | `0111` | `01111111` | 127 | + // | | | `01110000 | 112 | + // |----------------------|------------|--------------|-------| + // | 3-Byte Lead (+ 127) | `0110` | `01101111` | 111 | + // | | | `01100000 | 96 | + // |----------------------|------------|--------------|-------| + // | 2-Byte Lead (+ 127) | `010` | `01011111` | 95 | + // | | | `01000000 | 64 | + // |----------------------|------------|--------------|-------| + // | Continuation (+ 127) | `00` | `00111111` | 63 | + // | | | `00000000 | 0 | + // |----------------------|------------|--------------|-------| + // | ASCII (+ 127) | `1` | `11111111` | -1 | + // | | | `10000000` | -128 | + // |----------------------|------------|--------------|-------| + // + // *Now* we can use signed `>` on all of them: + // + // ``` + // prev1 = input.prev<1> + // prev2 = input.prev<2> + // prev3 = input.prev<3> + // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128` + // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128` + // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128` + // is_second_byte = prev1_flipped > 63; // 2+-byte lead + // is_third_byte = prev2_flipped > 95; // 3+-byte lead + // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead + // ``` + // + // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number + // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3 + // `^`'s at a time on Haswell, but only 2 `+`'s). + // + // That doesn't look like it saved us any instructions, did it? Well, because we're adding the + // same number to all of them, we can save one of those `+ 128` operations by assembling + // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128 + // to it. One more instruction saved! + // + // ``` + // prev1 = input.prev<1> + // prev3 = input.prev<3> + // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128` + // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128` + // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped): // | C -> ^ D, or + // you do B | C -> | A -> ^ D. But addition and subtraction *are* associative: (A + B + C) - D can + // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and + // then adds the result together. Same number of operations, but if the processor can run + // independent things in parallel (which most can), it runs faster. + // + // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have + // a super nice advantage in that more of them can be run at the same time (they can run on 3 + // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C, + // saving us the cycle we would have earned by using +. Even more, using an instruction with a + // wider array of ports can help *other* code run ahead, too, since these instructions can "get + // out of the way," running on a port other instructions can't. + // + // Epilogue II: One More Trick + // --------------------------- + // + // There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay + // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in + // check_special_cases()--but we'll talk about that there :) + // really_inline simd8 check_multibyte_lengths(simd8 input, simd8 prev_input, simd8 prev1) { simd8 prev2 = input.prev<2>(prev_input); simd8 prev3 = input.prev<3>(prev_input); @@ -12101,22 +11588,16 @@ class bit_indexer { class json_structural_indexer { public: - /** - * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. - * - * @param partial Setting the partial parameter to true allows the find_structural_bits to - * tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If - * you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8. - */ template - static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept; + static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept; private: - really_inline json_structural_indexer(uint32_t *structural_indexes); + really_inline json_structural_indexer(uint32_t *structural_indexes) + : indexer{structural_indexes} {} template really_inline void step(const uint8_t *block, buf_block_reader &reader) noexcept; really_inline void next(simd::simd8x64 in, json_block block, size_t idx); - really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial); + really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming); json_scanner scanner{}; utf8_checker checker{}; @@ -12125,44 +11606,42 @@ class json_structural_indexer { uint64_t unescaped_chars_error = 0; }; -really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {} +really_inline void json_structural_indexer::next(simd::simd8x64 in, json_block block, size_t idx) { + uint64_t unescaped = in.lteq(0x1F); + checker.check_next_input(in); + indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser + prev_structurals = block.structural_start(); + unescaped_chars_error |= block.non_quote_inside_string(unescaped); +} -// -// PERF NOTES: -// We pipe 2 inputs through these stages: -// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load -// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. -// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. -// The output of step 1 depends entirely on this information. These functions don't quite use -// up enough CPU: the second half of the functions is highly serial, only using 1 execution core -// at a time. The second input's scans has some dependency on the first ones finishing it, but -// they can make a lot of progress before they need that information. -// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that -// to finish: utf-8 checks and generating the output from the last iteration. -// -// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all -// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough -// workout. -// -template -error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept { - if (unlikely(len > parser.capacity())) { return CAPACITY; } - if (partial) { len = trim_partial_utf8(buf, len); } +really_inline error_code json_structural_indexer::finish(parser &parser, size_t idx, size_t len, bool streaming) { + // Write out the final iteration's structurals + indexer.write(uint32_t(idx-64), prev_structurals); - buf_block_reader reader(buf, len); - json_structural_indexer indexer(parser.structural_indexes.get()); + error_code error = scanner.finish(streaming); + if (unlikely(error != SUCCESS)) { return error; } - // Read all but the last block - while (reader.has_full_block()) { - indexer.step(reader.full_block(), reader); + if (unescaped_chars_error) { + return UNESCAPED_CHARS; } - // Take care of the last block (will always be there unless file is empty) - uint8_t block[STEP_SIZE]; - if (unlikely(reader.get_remainder(block) == 0)) { return EMPTY; } - indexer.step(block, reader); - - return indexer.finish(parser, reader.block_index(), len, partial); + parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); + /* a valid JSON file cannot have zero structural indexes - we should have + * found something */ + if (unlikely(parser.n_structural_indexes == 0u)) { + return EMPTY; + } + if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { + return UNEXPECTED_ERROR; + } + if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) { + /* the string might not be NULL terminated, but we add a virtual NULL + * ending character. */ + parser.structural_indexes[parser.n_structural_indexes++] = uint32_t(len); + } + /* make it safe to dereference one beyond this array */ + parser.structural_indexes[parser.n_structural_indexes] = 0; + return checker.errors(); } template<> @@ -12184,76 +11663,60 @@ really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_b reader.advance(); } -really_inline void json_structural_indexer::next(simd::simd8x64 in, json_block block, size_t idx) { - uint64_t unescaped = in.lteq(0x1F); - checker.check_next_input(in); - indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser - prev_structurals = block.structural_start(); - unescaped_chars_error |= block.non_quote_inside_string(unescaped); -} - -really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) { - // Write out the final iteration's structurals - indexer.write(uint32_t(idx-64), prev_structurals); - - error_code error = scanner.finish(partial); - if (unlikely(error != SUCCESS)) { return error; } +// +// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes. +// +// PERF NOTES: +// We pipe 2 inputs through these stages: +// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load +// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available. +// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path. +// The output of step 1 depends entirely on this information. These functions don't quite use +// up enough CPU: the second half of the functions is highly serial, only using 1 execution core +// at a time. The second input's scans has some dependency on the first ones finishing it, but +// they can make a lot of progress before they need that information. +// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that +// to finish: utf-8 checks and generating the output from the last iteration. +// +// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all +// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough +// workout. +// +// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings. +// The caller should still ensure that the input is valid UTF-8. If you are processing substrings, +// you may want to call on a function like trimmed_length_safe_utf8. +template +error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept { + if (unlikely(len > parser.capacity())) { return CAPACITY; } - if (unescaped_chars_error) { - return UNESCAPED_CHARS; + buf_block_reader reader(buf, len); + json_structural_indexer indexer(parser.structural_indexes.get()); + while (reader.has_full_block()) { + indexer.step(reader.full_block(), reader); } - parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get()); - /*** - * This is related to https://github.com/simdjson/simdjson/issues/906 - * Basically, we want to make sure that if the parsing continues beyond the last (valid) - * structural character, it quickly stops. - * Only three structural characters can be repeated without triggering an error in JSON: [,] and }. - * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing - * continues, then it must be [,] or }. - * Suppose it is ] or }. We backtrack to the first character, what could it be that would - * not trigger an error? It could be ] or } but no, because you can't start a document that way. - * It can't be a comma, a colon or any simple value. So the only way we could continue is - * if the repeated character is [. But if so, the document must start with [. But if the document - * starts with [, it should end with ]. If we enforce that rule, then we would get - * ][[ which is invalid. - **/ - parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); - parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len); - parser.structural_indexes[parser.n_structural_indexes + 2] = 0; - parser.next_structural_index = 0; - // a valid JSON file cannot have zero structural indexes - we should have found something - if (unlikely(parser.n_structural_indexes == 0u)) { - return EMPTY; - } - if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) { - return UNEXPECTED_ERROR; + if (likely(reader.has_remainder())) { + uint8_t block[STEP_SIZE]; + reader.get_remainder(block); + indexer.step(block, reader); } - if (partial) { - auto new_structural_indexes = find_next_document_index(parser); - if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) { - return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse. - } - parser.n_structural_indexes = new_structural_indexes; - } - return checker.errors(); + + return indexer.finish(parser, reader.block_index(), len, streaming); } } // namespace stage1 /* end file src/generic/stage1/json_structural_indexer.h */ -WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept { - this->buf = _buf; - this->len = _len; - return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming); +WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept { + return westmere::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming); } } // namespace westmere + } // namespace simdjson UNTARGET_REGION - -// -// Stage 2 -// +/* end file src/generic/stage1/json_structural_indexer.h */ +/* begin file src/westmere/stage2.cpp */ +/* westmere/implementation.h already included: #include "westmere/implementation.h" */ /* begin file src/westmere/stringparsing.h */ #ifndef SIMDJSON_WESTMERE_STRINGPARSING_H #define SIMDJSON_WESTMERE_STRINGPARSING_H @@ -12667,10 +12130,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) { // If you consume a large value and you map it to "infinity", you will no // longer be able to serialize back a standard-compliant JSON. And there is // no realistic application where you might need values so large than they - // can't fit in binary64. The maximal value is about 1.7976931348623157 x + // can't fit in binary64. The maximal value is about 1.7976931348623157 × // 10^308 It is an unimaginable large number. There will never be any piece of // engineering involving as many as 10^308 parts. It is estimated that there - // are about 10^80 atoms in the universe. The estimate for the total number + // are about 10^80 atoms in the universe.  The estimate for the total number // of electrons is similar. Using a double-precision floating-point value, we // can represent easily the number of atoms in the universe. We could also // represent the number of ways you can pick any three individual atoms at @@ -12690,6 +12153,26 @@ really_inline bool is_integer(char c) { // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers } +// We need to check that the character following a zero is valid. This is +// probably frequent and it is harder than it looks. We are building all of this +// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)... +const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + +really_inline bool +is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) { + return structural_or_whitespace_or_exponent_or_decimal_negated[c]; +} // check quickly whether the next 8 chars are made of digits // at a glance, it looks better than Mula's @@ -12767,14 +12250,14 @@ never_inline bool parse_large_integer(const uint8_t *const src, // as a positive signed integer, but the negative version is // possible. constexpr int64_t signed_answer = INT64_MIN; - writer.append_s64(signed_answer); + writer.write_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif } else { // we can negate safely int64_t signed_answer = -static_cast(i); - writer.append_s64(signed_answer); + writer.write_s64(signed_answer); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(signed_answer, src); #endif @@ -12787,12 +12270,12 @@ never_inline bool parse_large_integer(const uint8_t *const src, #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif - writer.append_s64(i); + writer.write_s64(i); } else { #ifdef JSON_TEST_NUMBERS // for unit testing found_unsigned_integer(i, src); #endif - writer.append_u64(i); + writer.write_u64(i); } } return is_structural_or_whitespace(*p); @@ -12802,7 +12285,7 @@ template bool slow_float_parsing(UNUSED const char * src, W writer) { double d; if (parse_float_strtod(src, &d)) { - writer.append_double(d); + writer.write_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, (const uint8_t *)src); #endif @@ -12826,10 +12309,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) { template really_inline bool parse_number(UNUSED const uint8_t *const src, UNUSED bool found_minus, - W &writer) { + W writer) { #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes // useful to skip parsing - writer.append_s64(0); // always write zero + writer.write_s64(0); // always write zero return true; // always succeeds #else const char *p = reinterpret_cast(src); @@ -12849,7 +12332,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, uint64_t i; // an unsigned int avoids signed overflows (which are bad) if (*p == '0') { // 0 cannot be followed by an integer ++p; - if (is_integer(*p)) { + if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) { #ifdef JSON_TEST_NUMBERS // for unit testing found_invalid_number(src); #endif @@ -12973,7 +12456,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, } // we over-decrement by one when there is a '.' digit_count -= int(start - start_digits); - if (unlikely(digit_count >= 19)) { + if (digit_count >= 19) { // Ok, chances are good that we had an overflow! // this is almost never going to get called!!! // we start anew, going slowly!!! @@ -12981,22 +12464,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, // 10000000000000000000000000000000000000000000e+308 // 3.1415926535897932384626433832795028841971693993751 // - bool success = slow_float_parsing((const char *) src, writer); - // The number was already written, but we made a copy of the writer - // when we passed it to the parse_large_integer() function, so - writer.skip_double(); - return success; + return slow_float_parsing((const char *) src, writer); } } if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) || (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!! // this is almost never going to get called!!! // we start anew, going slowly!!! - bool success = slow_float_parsing((const char *) src, writer); - // The number was already written, but we made a copy of the writer when we passed it to the - // slow_float_parsing() function, so we have to skip those tape spots now that we've returned - writer.skip_double(); - return success; + return slow_float_parsing((const char *) src, writer); } bool success = true; double d = compute_float_64(exponent, i, negative, &success); @@ -13005,7 +12480,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, success = parse_float_strtod((const char *)src, &d); } if (success) { - writer.append_double(d); + writer.write_double(d); #ifdef JSON_TEST_NUMBERS // for unit testing found_float(d, src); #endif @@ -13020,14 +12495,10 @@ really_inline bool parse_number(UNUSED const uint8_t *const src, if (unlikely(digit_count >= 18)) { // this is uncommon!!! // there is a good chance that we had an overflow, so we need // need to recover: we parse the whole thing again. - bool success = parse_large_integer(src, writer, found_minus); - // The number was already written, but we made a copy of the writer - // when we passed it to the parse_large_integer() function, so - writer.skip_large_integer(); - return success; + return parse_large_integer(src, writer, found_minus); } i = negative ? 0 - i : i; - writer.append_s64(i); + writer.write_s64(i); #ifdef JSON_TEST_NUMBERS // for unit testing found_integer(i, src); #endif @@ -13052,72 +12523,6 @@ TARGET_WESTMERE namespace simdjson { namespace westmere { -/* begin file src/generic/stage2/logger.h */ -// This is for an internal-only stage 2 specific logger. -// Set LOG_ENABLED = true to log what stage 2 is doing! -namespace logger { - static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"; - - static constexpr const bool LOG_ENABLED = false; - static constexpr const int LOG_EVENT_LEN = 30; - static constexpr const int LOG_BUFFER_LEN = 20; - static constexpr const int LOG_DETAIL_LEN = 50; - static constexpr const int LOG_INDEX_LEN = 10; - - static int log_depth; // Not threadsafe. Log only. - - // Helper to turn unprintable or newline characters into spaces - static really_inline char printable_char(char c) { - if (c >= 0x20) { - return c; - } else { - return ' '; - } - } - - // Print the header and set up log_start - static really_inline void log_start() { - if (LOG_ENABLED) { - log_depth = 0; - printf("\n"); - printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index"); - printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES); - } - } - - static really_inline void log_string(const char *message) { - if (LOG_ENABLED) { - printf("%s\n", message); - } - } - - // Logs a single line of - template - static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) { - if (LOG_ENABLED) { - printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title); - { - // Print the next N characters in the buffer. - printf("| "); - // Otherwise, print the characters starting from the buffer position. - // Print spaces for unprintable or newline characters. - for (int i=0;i really_inline bool with_space_terminated_copy(const F& f) { @@ -13220,25 +12617,32 @@ class structural_iterator { * practice unless you are in the strange scenario where you have many JSON * documents made of single atoms. */ - char *copy = static_cast(malloc(parser.len + SIMDJSON_PADDING)); + char *copy = static_cast(malloc(len + SIMDJSON_PADDING)); if (copy == nullptr) { return true; } - memcpy(copy, buf, parser.len); - memset(copy + parser.len, ' ', SIMDJSON_PADDING); - bool result = f(reinterpret_cast(copy), *current_structural); + memcpy(copy, buf, len); + memset(copy + len, ' ', SIMDJSON_PADDING); + bool result = f(reinterpret_cast(copy), idx); free(copy); return result; } really_inline bool past_end(uint32_t n_structural_indexes) { - return current_structural >= &parser.structural_indexes[n_structural_indexes]; + return next_structural+1 > n_structural_indexes; } really_inline bool at_end(uint32_t n_structural_indexes) { - return current_structural == &parser.structural_indexes[n_structural_indexes]; + return next_structural+1 == n_structural_indexes; } - really_inline bool at_beginning() { - return current_structural == parser.structural_indexes.get(); + really_inline size_t next_structural_index() { + return next_structural; } + + const uint8_t* const buf; + const size_t len; + const uint32_t* const structural_indexes; + size_t next_structural; // next structural index + size_t idx{0}; // location of the structural character in the input (buf) + uint8_t c{0}; // used to track the (structural) character we are looking at }; } // namespace stage2 @@ -13250,105 +12654,8 @@ class structural_iterator { // "simdjson/stage2.h" (this simplifies amalgation) namespace stage2 { -namespace { // Make everything here private - -/* begin file src/generic/stage2/tape_writer.h */ -struct tape_writer { - /** The next place to write to tape */ - uint64_t *next_tape_loc; - - /** Write a signed 64-bit value to tape. */ - really_inline void append_s64(int64_t value) noexcept; - - /** Write an unsigned 64-bit value to tape. */ - really_inline void append_u64(uint64_t value) noexcept; - - /** Write a double value to tape. */ - really_inline void append_double(double value) noexcept; - - /** - * Append a tape entry (an 8-bit type,and 56 bits worth of value). - */ - really_inline void append(uint64_t val, internal::tape_type t) noexcept; - - /** - * Skip the current tape entry without writing. - * - * Used to skip the start of the container, since we'll come back later to fill it in when the - * container ends. - */ - really_inline void skip() noexcept; - - /** - * Skip the number of tape entries necessary to write a large u64 or i64. - */ - really_inline void skip_large_integer() noexcept; - - /** - * Skip the number of tape entries necessary to write a double. - */ - really_inline void skip_double() noexcept; - - /** - * Write a value to a known location on tape. - * - * Used to go back and write out the start of a container after the container ends. - */ - really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept; - -private: - /** - * Append both the tape entry, and a supplementary value following it. Used for types that need - * all 64 bits, such as double and uint64_t. - */ - template - really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept; -}; // struct number_writer - -really_inline void tape_writer::append_s64(int64_t value) noexcept { - append2(0, value, internal::tape_type::INT64); -} - -really_inline void tape_writer::append_u64(uint64_t value) noexcept { - append(0, internal::tape_type::UINT64); - *next_tape_loc = value; - next_tape_loc++; -} -/** Write a double value to tape. */ -really_inline void tape_writer::append_double(double value) noexcept { - append2(0, value, internal::tape_type::DOUBLE); -} - -really_inline void tape_writer::skip() noexcept { - next_tape_loc++; -} - -really_inline void tape_writer::skip_large_integer() noexcept { - next_tape_loc += 2; -} - -really_inline void tape_writer::skip_double() noexcept { - next_tape_loc += 2; -} - -really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept { - *next_tape_loc = val | ((uint64_t(char(t))) << 56); - next_tape_loc++; -} - -template -really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept { - append(val, t); - static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!"); - memcpy(next_tape_loc, &val2, sizeof(val2)); - next_tape_loc++; -} - -really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept { - tape_loc = val | ((uint64_t(char(t))) << 56); -} -/* end file src/generic/stage2/tape_writer.h */ +using internal::ret_address; #ifdef SIMDJSON_USE_COMPUTED_GOTO #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue } @@ -13379,88 +12686,102 @@ really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal #endif // SIMDJSON_USE_COMPUTED_GOTO struct unified_machine_addresses { - ret_address_t array_begin; - ret_address_t array_continue; - ret_address_t error; - ret_address_t finish; - ret_address_t object_begin; - ret_address_t object_continue; + ret_address array_begin; + ret_address array_continue; + ret_address error; + ret_address finish; + ret_address object_begin; + ret_address object_continue; }; #undef FAIL_IF #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } } -struct structural_parser : structural_iterator { - /** Lets you append to the tape */ - tape_writer tape; +struct number_writer { + parser &doc_parser; + + really_inline void write_s64(int64_t value) noexcept { + write_tape(0, internal::tape_type::INT64); + std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value)); + ++doc_parser.current_loc; + } + really_inline void write_u64(uint64_t value) noexcept { + write_tape(0, internal::tape_type::UINT64); + doc_parser.doc.tape[doc_parser.current_loc++] = value; + } + really_inline void write_double(double value) noexcept { + write_tape(0, internal::tape_type::DOUBLE); + static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size"); + memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double)); + // doc.tape[doc.current_loc++] = *((uint64_t *)&d); + } + really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { + doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); + } +}; // struct number_writer + +struct structural_parser { + structural_iterator structurals; + parser &doc_parser; /** Next write location in the string buf for stage 2 parsing */ - uint8_t *current_string_buf_loc; - /** Current depth (nested objects and arrays) */ - uint32_t depth{0}; - - // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations - really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index) - : structural_iterator(_parser, start_structural_index), - tape{parser.doc->tape.get()}, - current_string_buf_loc{parser.doc->string_buf.get()} { - } - - WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) { - parser.containing_scope[depth].tape_index = next_tape_index(); - parser.containing_scope[depth].count = 0; - tape.skip(); // We don't actually *write* the start element until the end. - parser.ret_address[depth] = continue_state; + uint8_t *current_string_buf_loc{}; + uint32_t depth; + + really_inline structural_parser( + const uint8_t *buf, + size_t len, + parser &_doc_parser, + uint32_t next_structural = 0 + ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {} + + WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) { + doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc; + doc_parser.containing_scope[depth].count = 0; + write_tape(0, type); // if the document is correct, this gets rewritten later + doc_parser.ret_address[depth] = continue_state; depth++; - bool exceeded_max_depth = depth >= parser.max_depth(); - if (exceeded_max_depth) { log_error("Exceeded max depth!"); } - return exceeded_max_depth; + return depth >= doc_parser.max_depth(); } - WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) { - log_start_value("document"); - return start_scope(continue_state); + WARN_UNUSED really_inline bool start_document(ret_address continue_state) { + return start_scope(internal::tape_type::ROOT, continue_state); } - WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) { - log_start_value("object"); - return start_scope(continue_state); + WARN_UNUSED really_inline bool start_object(ret_address continue_state) { + return start_scope(internal::tape_type::START_OBJECT, continue_state); } - WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) { - log_start_value("array"); - return start_scope(continue_state); + WARN_UNUSED really_inline bool start_array(ret_address continue_state) { + return start_scope(internal::tape_type::START_ARRAY, continue_state); } // this function is responsible for annotating the start of the scope - really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept { + really_inline void end_scope(internal::tape_type type) noexcept { depth--; - // write our doc->tape location to the header scope + // write our doc.tape location to the header scope // The root scope gets written *at* the previous location. - tape.append(parser.containing_scope[depth].tape_index, end); + write_tape(doc_parser.containing_scope[depth].tape_index, type); // count can overflow if it exceeds 24 bits... so we saturate // the convention being that a cnt of 0xffffff or more is undetermined in value (>= 0xffffff). - const uint32_t start_tape_index = parser.containing_scope[depth].tape_index; - const uint32_t count = parser.containing_scope[depth].count; + const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index; + const uint32_t count = doc_parser.containing_scope[depth].count; const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count; - // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index] - tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start); - } - - really_inline uint32_t next_tape_index() { - return uint32_t(tape.next_tape_loc - parser.doc->tape.get()); + // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index] + doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32); } really_inline void end_object() { - log_end_value("object"); - end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT); + end_scope(internal::tape_type::END_OBJECT); } really_inline void end_array() { - log_end_value("array"); - end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY); + end_scope(internal::tape_type::END_ARRAY); } really_inline void end_document() { - log_end_value("document"); - end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT); + end_scope(internal::tape_type::ROOT); + } + + really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { + doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56); } // increment_count increments the count of keys in an object or values in an array. @@ -13468,16 +12789,17 @@ struct structural_parser : structural_iterator { // must be increment in the preceding depth (depth-1) where the array or // the object resides. really_inline void increment_count() { - parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 + doc_parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1 } really_inline uint8_t *on_start_string() noexcept { - // we advance the point, accounting for the fact that we have a NULL termination - tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING); + /* we advance the point, accounting for the fact that we have a NULL + * termination */ + write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING); return current_string_buf_loc + sizeof(uint32_t); } - really_inline void on_end_string(uint8_t *dst) noexcept { + really_inline bool on_end_string(uint8_t *dst) noexcept { uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t))); // TODO check for overflow in case someone has a crazy string (>=4GB?) // But only add the overflow check when the document itself exceeds 4GB @@ -13487,49 +12809,73 @@ struct structural_parser : structural_iterator { // be NULL terminated? It comes at a small cost *dst = 0; current_string_buf_loc = dst + 1; + return true; } - WARN_UNUSED really_inline bool parse_string(bool key = false) { - log_value(key ? "key" : "string"); + WARN_UNUSED really_inline bool parse_string() { uint8_t *dst = on_start_string(); - dst = stringparsing::parse_string(current(), dst); + dst = stringparsing::parse_string(structurals.current(), dst); if (dst == nullptr) { - log_error("Invalid escape in string"); return true; } - on_end_string(dst); - return false; + return !on_end_string(dst); } WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) { - log_value("number"); - bool succeeded = numberparsing::parse_number(src, found_minus, tape); - if (!succeeded) { log_error("Invalid number"); } - return !succeeded; + number_writer writer{doc_parser}; + return !numberparsing::parse_number(src, found_minus, writer); } WARN_UNUSED really_inline bool parse_number(bool found_minus) { - return parse_number(current(), found_minus); + return parse_number(structurals.current(), found_minus); + } + + WARN_UNUSED really_inline bool parse_atom() { + switch (structurals.current_char()) { + case 't': + if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; } + write_tape(0, internal::tape_type::TRUE_VALUE); + break; + case 'f': + if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; } + write_tape(0, internal::tape_type::FALSE_VALUE); + break; + case 'n': + if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; } + write_tape(0, internal::tape_type::NULL_VALUE); + break; + default: + return true; + } + return false; + } + + WARN_UNUSED really_inline bool parse_single_atom() { + switch (structurals.current_char()) { + case 't': + if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; } + write_tape(0, internal::tape_type::TRUE_VALUE); + break; + case 'f': + if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; } + write_tape(0, internal::tape_type::FALSE_VALUE); + break; + case 'n': + if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; } + write_tape(0, internal::tape_type::NULL_VALUE); + break; + default: + return true; + } + return false; } - WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) { - switch (advance_char()) { + WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) { + switch (structurals.current_char()) { case '"': FAIL_IF( parse_string() ); return continue_state; - case 't': - log_value("true"); - FAIL_IF( !atomparsing::is_valid_true_atom(current()) ); - tape.append(0, internal::tape_type::TRUE_VALUE); - return continue_state; - case 'f': - log_value("false"); - FAIL_IF( !atomparsing::is_valid_false_atom(current()) ); - tape.append(0, internal::tape_type::FALSE_VALUE); - return continue_state; - case 'n': - log_value("null"); - FAIL_IF( !atomparsing::is_valid_null_atom(current()) ); - tape.append(0, internal::tape_type::NULL_VALUE); + case 't': case 'f': case 'n': + FAIL_IF( parse_atom() ); return continue_state; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': @@ -13545,27 +12891,40 @@ struct structural_parser : structural_iterator { FAIL_IF( start_array(continue_state) ); return addresses.array_begin; default: - log_error("Non-value found when value was expected!"); return addresses.error; } } WARN_UNUSED really_inline error_code finish() { + // the string might not be NULL terminated. + if ( !structurals.at_end(doc_parser.n_structural_indexes) ) { + return on_error(TAPE_ERROR); + } end_document(); - parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]); - if (depth != 0) { - log_error("Unclosed objects or arrays!"); - return parser.error = TAPE_ERROR; + return on_error(TAPE_ERROR); + } + if (doc_parser.containing_scope[depth].tape_index != 0) { + return on_error(TAPE_ERROR); } - return SUCCESS; + return on_success(SUCCESS); + } + + really_inline error_code on_error(error_code new_error_code) noexcept { + doc_parser.error = new_error_code; + return new_error_code; + } + really_inline error_code on_success(error_code success_code) noexcept { + doc_parser.error = success_code; + doc_parser.valid = true; + return success_code; } WARN_UNUSED really_inline error_code error() { - /* We do not need the next line because this is done by parser.init_stage2(), + /* We do not need the next line because this is done by doc_parser.init_stage2(), * pessimistically. - * parser.is_valid = false; + * doc_parser.is_valid = false; * At this point in the code, we have all the time in the world. * Note that we know exactly where we are in the document so we could, * without any overhead on the processing code, report a specific @@ -13573,12 +12932,12 @@ struct structural_parser : structural_iterator { * We could even trigger special code paths to assess what happened * carefully, * all without any added cost. */ - if (depth >= parser.max_depth()) { - return parser.error = DEPTH_ERROR; + if (depth >= doc_parser.max_depth()) { + return on_error(DEPTH_ERROR); } - switch (current_char()) { + switch (structurals.current_char()) { case '"': - return parser.error = STRING_ERROR; + return on_error(STRING_ERROR); case '0': case '1': case '2': @@ -13590,124 +12949,92 @@ struct structural_parser : structural_iterator { case '8': case '9': case '-': - return parser.error = NUMBER_ERROR; + return on_error(NUMBER_ERROR); case 't': - return parser.error = T_ATOM_ERROR; + return on_error(T_ATOM_ERROR); case 'n': - return parser.error = N_ATOM_ERROR; + return on_error(N_ATOM_ERROR); case 'f': - return parser.error = F_ATOM_ERROR; + return on_error(F_ATOM_ERROR); default: - return parser.error = TAPE_ERROR; + return on_error(TAPE_ERROR); } } really_inline void init() { - log_start(); - parser.error = UNINITIALIZED; + current_string_buf_loc = doc_parser.doc.string_buf.get(); + doc_parser.current_loc = 0; + doc_parser.valid = false; + doc_parser.error = UNINITIALIZED; } - WARN_UNUSED really_inline error_code start(ret_address_t finish_state) { - // If there are no structurals left, return EMPTY - if (at_end(parser.n_structural_indexes)) { - return parser.error = EMPTY; + WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) { + init(); // sets is_valid to false + if (len > doc_parser.capacity()) { + return CAPACITY; } - - init(); + // Advance to the first character as soon as possible + structurals.advance_char(); // Push the root scope (there is always at least one scope) if (start_document(finish_state)) { - return parser.error = DEPTH_ERROR; + return on_error(DEPTH_ERROR); } return SUCCESS; } - really_inline void log_value(const char *type) { - logger::log_line(*this, "", type, ""); - } - - static really_inline void log_start() { - logger::log_start(); - } - - really_inline void log_start_value(const char *type) { - logger::log_line(*this, "+", type, ""); - if (logger::LOG_ENABLED) { logger::log_depth++; } - } - - really_inline void log_end_value(const char *type) { - if (logger::LOG_ENABLED) { logger::log_depth--; } - logger::log_line(*this, "-", type, ""); - } - - really_inline void log_error(const char *error) { - logger::log_line(*this, "", "ERROR", error); + really_inline char advance_char() { + return structurals.advance_char(); } -}; // struct structural_parser +}; // Redefine FAIL_IF to use goto since it'll be used inside the function now #undef FAIL_IF #define FAIL_IF(EXPR) { if (EXPR) { goto error; } } -template -WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept { - dom_parser.doc = &doc; +} // namespace stage2 + +/************ + * The JSON is parsed to a tape, see the accompanying tape.md file + * for documentation. + ***********/ +WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); - stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0); - error_code result = parser.start(addresses.finish); + stage2::structural_parser parser(buf, len, doc_parser); + error_code result = parser.start(len, addresses.finish); if (result) { return result; } // // Read first value // - switch (parser.current_char()) { + switch (parser.structurals.current_char()) { case '{': FAIL_IF( parser.start_object(addresses.finish) ); goto object_begin; case '[': FAIL_IF( parser.start_array(addresses.finish) ); - // Make sure the outer array is closed before continuing; otherwise, there are ways we could get - // into memory corruption. See https://github.com/simdjson/simdjson/issues/906 - if (!STREAMING) { - if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') { - goto error; - } - } goto array_begin; case '"': FAIL_IF( parser.parse_string() ); goto finish; - case 't': - parser.log_value("true"); - FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) ); - parser.tape.append(0, internal::tape_type::TRUE_VALUE); - goto finish; - case 'f': - parser.log_value("false"); - FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) ); - parser.tape.append(0, internal::tape_type::FALSE_VALUE); - goto finish; - case 'n': - parser.log_value("null"); - FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) ); - parser.tape.append(0, internal::tape_type::NULL_VALUE); + case 't': case 'f': case 'n': + FAIL_IF( parser.parse_single_atom() ); goto finish; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': FAIL_IF( - parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], false); }) ); goto finish; case '-': FAIL_IF( - parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { return parser.parse_number(©[idx], true); }) ); goto finish; default: - parser.log_error("Document starts with a non-value character"); goto error; } @@ -13718,45 +13045,43 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p switch (parser.advance_char()) { case '"': { parser.increment_count(); - FAIL_IF( parser.parse_string(true) ); + FAIL_IF( parser.parse_string() ); goto object_key_state; } case '}': parser.end_object(); goto scope_end; default: - parser.log_error("Object does not start with a key"); goto error; } object_key_state: - if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; } + FAIL_IF( parser.advance_char() != ':' ); + parser.advance_char(); GOTO( parser.parse_value(addresses, addresses.object_continue) ); object_continue: switch (parser.advance_char()) { case ',': parser.increment_count(); - if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; } - FAIL_IF( parser.parse_string(true) ); + FAIL_IF( parser.advance_char() != '"' ); + FAIL_IF( parser.parse_string() ); goto object_key_state; case '}': parser.end_object(); goto scope_end; default: - parser.log_error("No comma between object fields"); goto error; } scope_end: - CONTINUE( parser.parser.ret_address[parser.depth] ); + CONTINUE( parser.doc_parser.ret_address[parser.depth] ); // // Array parser states // array_begin: - if (parser.peek_next_char() == ']') { - parser.advance_char(); + if (parser.advance_char() == ']') { parser.end_array(); goto scope_end; } @@ -13771,12 +13096,12 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p switch (parser.advance_char()) { case ',': parser.increment_count(); + parser.advance_char(); goto main_array_switch; case ']': parser.end_array(); goto scope_end; default: - parser.log_error("Missing comma between array values"); goto error; } @@ -13787,46 +13112,176 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p return parser.error(); } -} // namespace {} +WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept { + error_code code = stage1(buf, len, doc_parser, false); + if (!code) { + code = stage2(buf, len, doc_parser); + } + return code; +} +/* end file src/generic/stage2/structural_parser.h */ +/* begin file src/generic/stage2/streaming_structural_parser.h */ +namespace stage2 { + +struct streaming_structural_parser: structural_parser { + really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {} + + // override to add streaming + WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) { + init(); // sets is_valid to false + // Capacity ain't no thang for streaming, so we don't check it. + // Advance to the first character as soon as possible + advance_char(); + // Push the root scope (there is always at least one scope) + if (start_document(finish_parser)) { + return on_error(DEPTH_ERROR); + } + return SUCCESS; + } + + // override to add streaming + WARN_UNUSED really_inline error_code finish() { + if ( structurals.past_end(doc_parser.n_structural_indexes) ) { + return on_error(TAPE_ERROR); + } + end_document(); + if (depth != 0) { + return on_error(TAPE_ERROR); + } + if (doc_parser.containing_scope[depth].tape_index != 0) { + return on_error(TAPE_ERROR); + } + bool finished = structurals.at_end(doc_parser.n_structural_indexes); + return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE); + } +}; + } // namespace stage2 /************ * The JSON is parsed to a tape, see the accompanying tape.md file * for documentation. ***********/ -WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept { - error_code result = stage2::parse_structurals(*this, _doc); +WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept { + static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES(); + stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json)); + error_code result = parser.start(len, addresses.finish); if (result) { return result; } + // + // Read first value + // + switch (parser.structurals.current_char()) { + case '{': + FAIL_IF( parser.start_object(addresses.finish) ); + goto object_begin; + case '[': + FAIL_IF( parser.start_array(addresses.finish) ); + goto array_begin; + case '"': + FAIL_IF( parser.parse_string() ); + goto finish; + case 't': case 'f': case 'n': + FAIL_IF( parser.parse_single_atom() ); + goto finish; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + FAIL_IF( + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + return parser.parse_number(©[idx], false); + }) + ); + goto finish; + case '-': + FAIL_IF( + parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) { + return parser.parse_number(©[idx], true); + }) + ); + goto finish; + default: + goto error; + } - // If we didn't make it to the end, it's an error - if ( next_structural_index != n_structural_indexes ) { - logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!"); - return error = TAPE_ERROR; +// +// Object parser parsers +// +object_begin: + switch (parser.advance_char()) { + case '"': { + FAIL_IF( parser.parse_string() ); + goto object_key_parser; + } + case '}': + parser.end_object(); + goto scope_end; + default: + goto error; } - return SUCCESS; -} +object_key_parser: + FAIL_IF( parser.advance_char() != ':' ); + parser.increment_count(); + parser.advance_char(); + GOTO( parser.parse_value(addresses, addresses.object_continue) ); -/************ - * The JSON is parsed to a tape, see the accompanying tape.md file - * for documentation. - ***********/ -WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { - return stage2::parse_structurals(*this, _doc); -} -/* end file src/generic/stage2/tape_writer.h */ +object_continue: + switch (parser.advance_char()) { + case ',': + FAIL_IF( parser.advance_char() != '"' ); + FAIL_IF( parser.parse_string() ); + goto object_key_parser; + case '}': + parser.end_object(); + goto scope_end; + default: + goto error; + } + +scope_end: + CONTINUE( parser.doc_parser.ret_address[parser.depth] ); + +// +// Array parser parsers +// +array_begin: + if (parser.advance_char() == ']') { + parser.end_array(); + goto scope_end; + } + parser.increment_count(); + +main_array_switch: + /* we call update char on all paths in, so we can peek at parser.c on the + * on paths that can accept a close square brace (post-, and at start) */ + GOTO( parser.parse_value(addresses, addresses.array_continue) ); + +array_continue: + switch (parser.advance_char()) { + case ',': + parser.increment_count(); + parser.advance_char(); + goto main_array_switch; + case ']': + parser.end_array(); + goto scope_end; + default: + goto error; + } + +finish: + next_json = parser.structurals.next_structural_index(); + return parser.finish(); -WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { - error_code err = stage1(_buf, _len, false); - if (err) { return err; } - return stage2(_doc); +error: + return parser.error(); } +/* end file src/generic/stage2/streaming_structural_parser.h */ } // namespace westmere } // namespace simdjson UNTARGET_REGION -/* end file src/generic/stage2/tape_writer.h */ +/* end file src/generic/stage2/streaming_structural_parser.h */ #endif SIMDJSON_POP_DISABLE_WARNINGS -/* end file src/generic/stage2/tape_writer.h */ +/* end file src/generic/stage2/streaming_structural_parser.h */ diff --git a/inst/include/simdjson.h b/inst/include/simdjson.h index 21efa8e..0a1d140 100644 --- a/inst/include/simdjson.h +++ b/inst/include/simdjson.h @@ -1,4 +1,4 @@ -/* auto-generated on Fri 12 Jun 2020 13:09:36 EDT. Do not edit! */ +/* auto-generated on Wed May 20 10:23:07 EDT 2020. Do not edit! */ /* begin file include/simdjson.h */ #ifndef SIMDJSON_H #define SIMDJSON_H @@ -2030,6 +2030,7 @@ namespace simdjson { */ enum error_code { SUCCESS = 0, ///< No error + SUCCESS_AND_HAS_MORE, ///< @private No error and buffer still has more data CAPACITY, ///< This parser can't support a document that big MEMALLOC, ///< Error allocating memory, most likely out of memory TAPE_ERROR, ///< Something went wrong while writing to the tape (stage 2), this is a generic error @@ -2408,187 +2409,6 @@ inline char *allocate_padded_buffer(size_t length) noexcept; #ifndef SIMDJSON_IMPLEMENTATION_H #define SIMDJSON_IMPLEMENTATION_H -/* begin file include/simdjson/internal/dom_parser_implementation.h */ -#ifndef SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H -#define SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H - -#include - -namespace simdjson { - -namespace dom { -class document; -} // namespace dom - -namespace internal { - -/** - * An implementation of simdjson's DOM parser for a particular CPU architecture. - * - * This class is expected to be accessed only by pointer, and never move in memory (though the - * pointer can move). - */ -class dom_parser_implementation { -public: - - /** - * @private For internal implementation use - * - * Run a full JSON parse on a single document (stage1 + stage2). - * - * Guaranteed only to be called when capacity > document length. - * - * Overridden by each implementation. - * - * @param buf The json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. - * @param len The length of the json document. - * @return The error code, or SUCCESS if there was no error. - */ - WARN_UNUSED virtual error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept = 0; - - /** - * @private For internal implementation use - * - * Stage 1 of the document parser. - * - * Guaranteed only to be called when capacity > document length. - * - * Overridden by each implementation. - * - * @param buf The json document to parse. - * @param len The length of the json document. - * @param streaming Whether this is being called by parser::parse_many. - * @return The error code, or SUCCESS if there was no error. - */ - WARN_UNUSED virtual error_code stage1(const uint8_t *buf, size_t len, bool streaming) noexcept = 0; - - /** - * @private For internal implementation use - * - * Stage 2 of the document parser. - * - * Called after stage1(). - * - * Overridden by each implementation. - * - * @param doc The document to output to. - * @return The error code, or SUCCESS if there was no error. - */ - WARN_UNUSED virtual error_code stage2(dom::document &doc) noexcept = 0; - - /** - * @private For internal implementation use - * - * Stage 2 of the document parser for parser::parse_many. - * - * Guaranteed only to be called after stage1(). - * Overridden by each implementation. - * - * @param doc The document to output to. - * @return The error code, SUCCESS if there was no error, or EMPTY if all documents have been parsed. - */ - WARN_UNUSED virtual error_code stage2_next(dom::document &doc) noexcept = 0; - - /** - * Change the capacity of this parser. - * - * Generally used for reallocation. - * - * @param capacity The new capacity. - * @param max_depth The new max_depth. - * @return The error code, or SUCCESS if there was no error. - */ - virtual error_code set_capacity(size_t capacity) noexcept = 0; - - /** - * Change the max depth of this parser. - * - * Generally used for reallocation. - * - * @param capacity The new capacity. - * @param max_depth The new max_depth. - * @return The error code, or SUCCESS if there was no error. - */ - virtual error_code set_max_depth(size_t max_depth) noexcept = 0; - - /** - * Deallocate this parser. - */ - virtual ~dom_parser_implementation() = default; - - /** Number of structural indices passed from stage 1 to stage 2 */ - uint32_t n_structural_indexes{0}; - /** Structural indices passed from stage 1 to stage 2 */ - std::unique_ptr structural_indexes{}; - /** Next structural index to parse */ - uint32_t next_structural_index{0}; - - /** - * The largest document this parser can support without reallocating. - * - * @return Current capacity, in bytes. - */ - really_inline size_t capacity() const noexcept; - - /** - * The maximum level of nested object and arrays supported by this parser. - * - * @return Maximum depth, in bytes. - */ - really_inline size_t max_depth() const noexcept; - - /** - * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length - * and `max_depth` depth. - * - * @param capacity The new capacity. - * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. - * @return The error, if there is one. - */ - WARN_UNUSED inline error_code allocate(size_t capacity, size_t max_depth) noexcept; - -protected: - /** - * The maximum document length this parser supports. - * - * Buffers are large enough to handle any document up to this length. - */ - size_t _capacity{0}; - - /** - * The maximum depth (number of nested objects and arrays) supported by this parser. - * - * Defaults to DEFAULT_MAX_DEPTH. - */ - size_t _max_depth{0}; -}; // class dom_parser_implementation - -really_inline size_t dom_parser_implementation::capacity() const noexcept { - return _capacity; -} - -really_inline size_t dom_parser_implementation::max_depth() const noexcept { - return _max_depth; -} - -WARN_UNUSED -inline error_code dom_parser_implementation::allocate(size_t capacity, size_t max_depth) noexcept { - if (this->max_depth() != max_depth) { - error_code err = set_max_depth(max_depth); - if (err) { return err; } - } - if (_capacity != capacity) { - error_code err = set_capacity(capacity); - if (err) { return err; } - } - return SUCCESS; -} - -} // namespace internal -} // namespace simdjson - -#endif // SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H -/* end file include/simdjson/internal/dom_parser_implementation.h */ #include #include #include @@ -2597,8 +2417,8 @@ inline error_code dom_parser_implementation::allocate(size_t capacity, size_t ma namespace simdjson { namespace dom { - class document; -} // namespace dom + class parser; +} /** * An implementation of simdjson for a particular CPU architecture. @@ -2641,19 +2461,16 @@ class implementation { /** * @private For internal implementation use * - * const implementation *impl = simdjson::active_implementation; - * cout << "simdjson is optimized for " << impl->name() << "(" << impl->description() << ")" << endl; + * Run a full document parse (ensure_capacity, stage1 and stage2). * - * @param capacity The largest document that will be passed to the parser. - * @param max_depth The maximum JSON object/array nesting this parser is expected to handle. - * @param dst The place to put the resulting parser implementation. - * @return the name of the implementation, e.g. "haswell", "westmere", "arm64" + * Overridden by each implementation. + * + * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. + * @param len the length of the json document. + * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity. + * @return the error code, or SUCCESS if there was no error. */ - virtual error_code create_dom_parser_implementation( - size_t capacity, - size_t max_depth, - std::unique_ptr &dst - ) const noexcept = 0; + WARN_UNUSED virtual error_code parse(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept = 0; /** * @private For internal implementation use @@ -2670,6 +2487,50 @@ class implementation { */ WARN_UNUSED virtual error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept = 0; + /** + * @private For internal implementation use + * + * Stage 1 of the document parser. + * + * Overridden by each implementation. + * + * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. + * @param len the length of the json document. + * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity. + * @param streaming whether this is being called by parser::parse_many. + * @return the error code, or SUCCESS if there was no error. + */ + WARN_UNUSED virtual error_code stage1(const uint8_t *buf, size_t len, dom::parser &parser, bool streaming) const noexcept = 0; + + /** + * @private For internal implementation use + * + * Stage 2 of the document parser. + * + * Overridden by each implementation. + * + * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. + * @param len the length of the json document. + * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity. + * @return the error code, or SUCCESS if there was no error. + */ + WARN_UNUSED virtual error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept = 0; + + /** + * @private For internal implementation use + * + * Stage 2 of the document parser for parser::parse_many. + * + * Overridden by each implementation. + * + * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes. + * @param len the length of the json document. + * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity. + * @param next_json the next structural index. Start this at 0 the first time, and it will be updated to the next value to pass each time. + * @return the error code, SUCCESS if there was no error, or SUCCESS_AND_HAS_MORE if there was no error and stage2 can be called again. + */ + WARN_UNUSED virtual error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser, size_t &next_json) const noexcept = 0; + protected: /** @private Construct an implementation with the given name and description. For subclasses. */ really_inline implementation( @@ -2787,7 +2648,7 @@ extern SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr activ } // namespace simdjson #endif // SIMDJSON_IMPLEMENTATION_H -/* end file include/simdjson/internal/dom_parser_implementation.h */ +/* end file include/simdjson/implementation.h */ /* begin file include/simdjson/dom/array.h */ #ifndef SIMDJSON_DOM_ARRAY_H #define SIMDJSON_DOM_ARRAY_H @@ -3161,6 +3022,22 @@ class document { namespace simdjson { +namespace internal { + +// expectation: sizeof(scope_descriptor) = 64/8. +struct scope_descriptor { + uint32_t tape_index; // where, on the tape, does the scope ([,{) begins + uint32_t count; // how many elements in the scope +}; // struct scope_descriptor + +#ifdef SIMDJSON_USE_COMPUTED_GOTO +typedef void* ret_address; +#else +typedef char ret_address; +#endif + +} // namespace internal + namespace dom { class document_stream; @@ -3198,14 +3075,14 @@ class parser { * * @param other The parser to take. Its capacity is zeroed. */ - really_inline parser(parser &&other) noexcept; + parser(parser &&other) = default; parser(const parser &) = delete; ///< @private Disallow copying /** * Take another parser's buffers and state. * * @param other The parser to take. Its capacity is zeroed. */ - really_inline parser &operator=(parser &&other) noexcept; + parser &operator=(parser &&other) = default; parser &operator=(const parser &) = delete; ///< @private Disallow copying /** Deallocate the JSON parser. */ @@ -3465,8 +3342,7 @@ class parser { /** * Set max_capacity. This is the largest document this parser can automatically support. * - * The parser may reallocate internal buffers as needed up to this amount as documents are passed - * to it. + * The parser may reallocate internal buffers as needed up to this amount. * * This call will not allocate or deallocate, even if capacity is currently above max_capacity. * @@ -3479,8 +3355,19 @@ class parser { /** @private Use simdjson_error instead */ using InvalidJSON [[deprecated("Use simdjson_error instead")]] = simdjson_error; - /** @private [for benchmarking access] The implementation to use */ - std::unique_ptr implementation{}; + /** @private Next location to write to in the tape */ + uint32_t current_loc{0}; + + /** @private Number of structural indices passed from stage 1 to stage 2 */ + uint32_t n_structural_indexes{0}; + /** @private Structural indices passed from stage 1 to stage 2 */ + std::unique_ptr structural_indexes{}; + + /** @private Tape location of each open { or [ */ + std::unique_ptr containing_scope{}; + + /** @private Return address of each open { or [ */ + std::unique_ptr ret_address{}; /** @private Use `if (parser.parse(...).error())` instead */ bool valid{false}; @@ -3520,6 +3407,20 @@ class parser { */ size_t _max_capacity; + /** + * The maximum document length this parser supports. + * + * Buffers are large enough to handle any document up to this length. + */ + size_t _capacity{0}; + + /** + * The maximum depth (number of nested objects and arrays) supported by this parser. + * + * Defaults to DEFAULT_MAX_DEPTH. + */ + size_t _max_depth{0}; + /** * The loaded buffer (reused each time load() is called) */ @@ -3599,7 +3500,7 @@ class document_stream { really_inline bool operator!=(const iterator &other) const noexcept; private: - really_inline iterator(document_stream &s, bool finished) noexcept; + iterator(document_stream& stream, bool finished) noexcept; /** The document_stream we're iterating through. */ document_stream& stream; /** Whether we're finished or not. */ @@ -3622,23 +3523,7 @@ class document_stream { document_stream(document_stream &other) = delete; // Disallow copying - /** - * Construct a document_stream. Does not allocate or parse anything until the iterator is - * used. - */ - really_inline document_stream( - dom::parser &parser, - const uint8_t *buf, - size_t len, - size_t batch_size, - error_code error = SUCCESS - ) noexcept; - - /** - * Parse the first document in the buffer. Used by begin(), to handle allocation and - * initialization. - */ - inline void start() noexcept; + really_inline document_stream(dom::parser &parser, const uint8_t *buf, size_t len, size_t batch_size, error_code error = SUCCESS) noexcept; /** * Parse the next document found in the buffer previously given to document_stream. @@ -3651,7 +3536,10 @@ class document_stream { * pre-allocating a capacity defined by the batch_size defined when creating the * document_stream object. * - * The function returns simdjson::EMPTY if there is no more data to be parsed. + * The function returns simdjson::SUCCESS_AND_HAS_MORE (an integer = 1) in case + * of success and indicates that the buffer still contains more data to be parsed, + * meaning this function can be called again to return the next JSON document + * after this one. * * The function returns simdjson::SUCCESS (as integer = 0) in case of success * and indicates that the buffer has successfully been parsed to the end. @@ -3662,51 +3550,55 @@ class document_stream { * the simdjson::error_message function converts these error codes into a string). * * You can also check validity by calling parser.is_valid(). The same parser can - * and should be reused for the other documents in the buffer. - */ - inline void next() noexcept; + * and should be reused for the other documents in the buffer. */ + inline error_code json_parse() noexcept; /** - * Pass the next batch through stage 1 and return when finished. - * When threads are enabled, this may wait for the stage 1 thread to finish. + * Returns the location (index) of where the next document should be in the + * buffer. + * Can be used for debugging, it tells the user the position of the end of the + * last + * valid JSON document parsed */ - inline void load_batch() noexcept; + inline size_t get_current_buffer_loc() const { return current_buffer_loc; } - /** Get the next document index. */ - inline size_t next_batch_start() const noexcept; - - /** Pass the next batch through stage 1 with the given parser. */ - inline error_code run_stage1(dom::parser &p, size_t batch_start) noexcept; - - dom::parser &parser; - const uint8_t *buf; - const size_t len; - const size_t batch_size; - size_t batch_start{0}; - /** The error (or lack thereof) from the current document. */ - error_code error; - -#ifdef SIMDJSON_THREADS_ENABLED - inline void load_from_stage1_thread() noexcept; + /** + * Returns the total amount of complete documents parsed by the document_stream, + * in the current buffer, at the given time. + */ + inline size_t get_n_parsed_docs() const { return n_parsed_docs; } - /** Start a thread to run stage 1 on the next batch. */ - inline void start_stage1_thread() noexcept; + /** + * Returns the total amount of data (in bytes) parsed by the document_stream, + * in the current buffer, at the given time. + */ + inline size_t get_n_bytes_parsed() const { return n_bytes_parsed; } - /** Wait for the stage 1 thread to finish and capture the results. */ - inline void finish_stage1_thread() noexcept; + inline const uint8_t *buf() const { return _buf + buf_start; } - /** The error returned from the stage 1 thread. */ - error_code stage1_thread_error{UNINITIALIZED}; - /** The thread used to run stage 1 against the next batch in the background. */ - std::thread stage1_thread{}; + inline void advance(size_t offset) { buf_start += offset; } - /** - * The parser used to run stage 1 in the background. Will be swapped - * with the regular parser when finished. - */ - dom::parser stage1_thread_parser{}; -#endif // SIMDJSON_THREADS_ENABLED + inline size_t remaining() const { return _len - buf_start; } + dom::parser &parser; + const uint8_t *_buf; + const size_t _len; + size_t _batch_size; // this is actually variable! + size_t buf_start{0}; + size_t next_json{0}; + bool load_next_batch{true}; + size_t current_buffer_loc{0}; +#ifdef SIMDJSON_THREADS_ENABLED + size_t last_json_buffer_loc{0}; +#endif + size_t n_parsed_docs{0}; + size_t n_bytes_parsed{0}; + error_code error{SUCCESS_AND_HAS_MORE}; +#ifdef SIMDJSON_THREADS_ENABLED + error_code stage1_is_ok_thread{SUCCESS}; + std::thread stage_1_thread{}; + dom::parser parser_thread{}; +#endif friend class dom::parser; }; // class document_stream @@ -4951,36 +4843,124 @@ inline std::ostream& operator<<(std::ostream& out, const simdjson_result namespace simdjson { -namespace dom { +namespace internal { + +/** + * This algorithm is used to quickly identify the buffer position of + * the last JSON document inside the current batch. + * + * It does its work by finding the last pair of structural characters + * that represent the end followed by the start of a document. + * + * Simply put, we iterate over the structural characters, starting from + * the end. We consider that we found the end of a JSON document when the + * first element of the pair is NOT one of these characters: '{' '[' ';' ',' + * and when the second element is NOT one of these characters: '}' '}' ';' ','. + * + * This simple comparison works most of the time, but it does not cover cases + * where the batch's structural indexes contain a perfect amount of documents. + * In such a case, we do not have access to the structural index which follows + * the last document, therefore, we do not have access to the second element in + * the pair, and means that we cannot identify the last document. To fix this + * issue, we keep a count of the open and closed curly/square braces we found + * while searching for the pair. When we find a pair AND the count of open and + * closed curly/square braces is the same, we know that we just passed a + * complete + * document, therefore the last json buffer location is the end of the batch + * */ +inline uint32_t find_last_json_buf_idx(const uint8_t *buf, size_t size, const dom::parser &parser) { + // this function can be generally useful + if (parser.n_structural_indexes == 0) + return 0; + auto last_i = parser.n_structural_indexes - 1; + if (parser.structural_indexes[last_i] == size) { + if (last_i == 0) + return 0; + last_i = parser.n_structural_indexes - 2; + } + auto arr_cnt = 0; + auto obj_cnt = 0; + for (auto i = last_i; i > 0; i--) { + auto idxb = parser.structural_indexes[i]; + switch (buf[idxb]) { + case ':': + case ',': + continue; + case '}': + obj_cnt--; + continue; + case ']': + arr_cnt--; + continue; + case '{': + obj_cnt++; + break; + case '[': + arr_cnt++; + break; + } + auto idxa = parser.structural_indexes[i - 1]; + switch (buf[idxa]) { + case '{': + case '[': + case ':': + case ',': + continue; + } + if (!arr_cnt && !obj_cnt) { + return last_i + 1; + } + return i; + } + return 0; +} +// returns true if the provided byte value is an ASCII character +static inline bool is_ascii(char c) { + return ((unsigned char)c) <= 127; +} + +// if the string ends with UTF-8 values, backtrack +// up to the first ASCII character. May return 0. +static inline size_t trimmed_length_safe_utf8(const char * c, size_t len) { + while ((len > 0) and (not is_ascii(c[len - 1]))) { + len--; + } + return len; +} + +} // namespace internal + +} // namespace simdjson + +namespace simdjson { +namespace dom { really_inline document_stream::document_stream( dom::parser &_parser, - const uint8_t *_buf, - size_t _len, - size_t _batch_size, + const uint8_t *buf, + size_t len, + size_t batch_size, error_code _error ) noexcept : parser{_parser}, - buf{_buf}, - len{_len}, - batch_size{_batch_size}, - error{_error} + _buf{buf}, + _len{len}, + _batch_size(batch_size), + error(_error) { + if (!error) { error = json_parse(); } } inline document_stream::~document_stream() noexcept { #ifdef SIMDJSON_THREADS_ENABLED - // TODO kill the thread, why should people have to wait for a non-side-effecting operation to complete - if (stage1_thread.joinable()) { - stage1_thread.join(); + if (stage_1_thread.joinable()) { + stage_1_thread.join(); } #endif } really_inline document_stream::iterator document_stream::begin() noexcept { - start(); - // If there are no documents, we're finished. - return iterator(*this, error == EMPTY); + return iterator(*this, false); } really_inline document_stream::iterator document_stream::end() noexcept { @@ -4992,15 +4972,17 @@ really_inline document_stream::iterator::iterator(document_stream& _stream, bool } really_inline simdjson_result document_stream::iterator::operator*() noexcept { - // Once we have yielded any errors, we're finished. - if (stream.error) { finished = true; return stream.error; } + error_code err = stream.error == SUCCESS_AND_HAS_MORE ? SUCCESS : stream.error; + if (err) { return err; } return stream.parser.doc.root(); } really_inline document_stream::iterator& document_stream::iterator::operator++() noexcept { - stream.next(); - // If that was the last document, we're finished. - if (stream.error == EMPTY) { finished = true; } + if (stream.error == SUCCESS_AND_HAS_MORE) { + stream.error = stream.json_parse(); + } else { + finished = true; + } return *this; } @@ -5008,96 +4990,130 @@ really_inline bool document_stream::iterator::operator!=(const document_stream:: return finished != other.finished; } -inline void document_stream::start() noexcept { - if (error) { return; } - - error = parser.ensure_capacity(batch_size); - if (error) { return; } - - // Always run the first stage 1 parse immediately - batch_start = 0; - error = run_stage1(parser, batch_start); - if (error) { return; } - #ifdef SIMDJSON_THREADS_ENABLED - if (next_batch_start() < len) { - // Kick off the first thread if needed - error = stage1_thread_parser.ensure_capacity(batch_size); - if (error) { return; } - start_stage1_thread(); - if (error) { return; } - } -#endif // SIMDJSON_THREADS_ENABLED - - next(); -} - -inline void document_stream::next() noexcept { - if (error) { return; } - - // Load the next document from the batch - error = parser.implementation->stage2_next(parser.doc); - - // If that was the last document in the batch, load another batch (if available) - while (error == EMPTY) { - batch_start = next_batch_start(); - if (batch_start >= len) { break; } - -#ifdef SIMDJSON_THREADS_ENABLED - load_from_stage1_thread(); -#else - error = run_stage1(parser, batch_start); -#endif - if (error) { continue; } // If the error was EMPTY, we may want to load another batch. - - // Run stage 2 on the first document in the batch - error = parser.implementation->stage2_next(parser.doc); - } -} -inline size_t document_stream::next_batch_start() const noexcept { - return batch_start + parser.implementation->structural_indexes[parser.implementation->n_structural_indexes]; -} - -inline error_code document_stream::run_stage1(dom::parser &p, size_t _batch_start) noexcept { - // If this is the final batch, pass partial = false - size_t remaining = len - _batch_start; - if (remaining <= batch_size) { - return p.implementation->stage1(&buf[_batch_start], remaining, false); - } else { - return p.implementation->stage1(&buf[_batch_start], batch_size, true); +// threaded version of json_parse +// todo: simplify this code further +inline error_code document_stream::json_parse() noexcept { + error = parser.ensure_capacity(_batch_size); + if (error) { return error; } + error = parser_thread.ensure_capacity(_batch_size); + if (error) { return error; } + + if (unlikely(load_next_batch)) { + // First time loading + if (!stage_1_thread.joinable()) { + _batch_size = (std::min)(_batch_size, remaining()); + _batch_size = internal::trimmed_length_safe_utf8((const char *)buf(), _batch_size); + if (_batch_size == 0) { + return simdjson::UTF8_ERROR; + } + auto stage1_is_ok = error_code(simdjson::active_implementation->stage1(buf(), _batch_size, parser, true)); + if (stage1_is_ok != simdjson::SUCCESS) { + return stage1_is_ok; + } + uint32_t last_index = internal::find_last_json_buf_idx(buf(), _batch_size, parser); + if (last_index == 0) { + if (parser.n_structural_indexes == 0) { + return simdjson::EMPTY; + } + } else { + parser.n_structural_indexes = last_index + 1; + } + } + // the second thread is running or done. + else { + stage_1_thread.join(); + if (stage1_is_ok_thread != simdjson::SUCCESS) { + return stage1_is_ok_thread; + } + std::swap(parser.structural_indexes, parser_thread.structural_indexes); + parser.n_structural_indexes = parser_thread.n_structural_indexes; + advance(last_json_buffer_loc); + n_bytes_parsed += last_json_buffer_loc; + } + // let us decide whether we will start a new thread + if (remaining() - _batch_size > 0) { + last_json_buffer_loc = + parser.structural_indexes[internal::find_last_json_buf_idx(buf(), _batch_size, parser)]; + _batch_size = (std::min)(_batch_size, remaining() - last_json_buffer_loc); + if (_batch_size > 0) { + _batch_size = internal::trimmed_length_safe_utf8( + (const char *)(buf() + last_json_buffer_loc), _batch_size); + if (_batch_size == 0) { + return simdjson::UTF8_ERROR; + } + // let us capture read-only variables + const uint8_t *const b = buf() + last_json_buffer_loc; + const size_t bs = _batch_size; + // we call the thread on a lambda that will update + // this->stage1_is_ok_thread + // there is only one thread that may write to this value + stage_1_thread = std::thread([this, b, bs] { + this->stage1_is_ok_thread = error_code(simdjson::active_implementation->stage1(b, bs, this->parser_thread, true)); + }); + } + } + next_json = 0; + load_next_batch = false; + } // load_next_batch + error_code res = simdjson::active_implementation->stage2(buf(), remaining(), parser, next_json); + if (res == simdjson::SUCCESS_AND_HAS_MORE) { + n_parsed_docs++; + current_buffer_loc = parser.structural_indexes[next_json]; + load_next_batch = (current_buffer_loc == last_json_buffer_loc); + } else if (res == simdjson::SUCCESS) { + n_parsed_docs++; + if (remaining() > _batch_size) { + current_buffer_loc = parser.structural_indexes[next_json - 1]; + load_next_batch = true; + res = simdjson::SUCCESS_AND_HAS_MORE; + } } + return res; } -#ifdef SIMDJSON_THREADS_ENABLED - -inline void document_stream::load_from_stage1_thread() noexcept { - stage1_thread.join(); +#else // SIMDJSON_THREADS_ENABLED - // Swap to the parser that was loaded up in the thread. Make sure the parser has - // enough memory to swap to, as well. - std::swap(parser, stage1_thread_parser); - error = stage1_thread_error; - if (error) { return; } +// single-threaded version of json_parse +inline error_code document_stream::json_parse() noexcept { + error = parser.ensure_capacity(_batch_size); + if (error) { return error; } - // If there's anything left, start the stage 1 thread! - if (next_batch_start() < len) { - start_stage1_thread(); + if (unlikely(load_next_batch)) { + advance(current_buffer_loc); + n_bytes_parsed += current_buffer_loc; + _batch_size = (std::min)(_batch_size, remaining()); + _batch_size = internal::trimmed_length_safe_utf8((const char *)buf(), _batch_size); + auto stage1_is_ok = (error_code)simdjson::active_implementation->stage1(buf(), _batch_size, parser, true); + if (stage1_is_ok != simdjson::SUCCESS) { + return stage1_is_ok; + } + uint32_t last_index = internal::find_last_json_buf_idx(buf(), _batch_size, parser); + if (last_index == 0) { + if (parser.n_structural_indexes == 0) { + return EMPTY; + } + } else { + parser.n_structural_indexes = last_index + 1; + } + load_next_batch = false; + } // load_next_batch + error_code res = simdjson::active_implementation->stage2(buf(), remaining(), parser, next_json); + if (likely(res == simdjson::SUCCESS_AND_HAS_MORE)) { + n_parsed_docs++; + current_buffer_loc = parser.structural_indexes[next_json]; + } else if (res == simdjson::SUCCESS) { + n_parsed_docs++; + if (remaining() > _batch_size) { + current_buffer_loc = parser.structural_indexes[next_json - 1]; + next_json = 1; + load_next_batch = true; + res = simdjson::SUCCESS_AND_HAS_MORE; + } } + return res; } - -inline void document_stream::start_stage1_thread() noexcept { - // we call the thread on a lambda that will update - // this->stage1_thread_error - // there is only one thread that may write to this value - // TODO this is NOT exception-safe. - this->stage1_thread_error = UNINITIALIZED; // In case something goes wrong, make sure it's an error - size_t _next_batch_start = this->next_batch_start(); - stage1_thread = std::thread([this, _next_batch_start] { - this->stage1_thread_error = run_stage1(this->stage1_thread_parser, _next_batch_start); - }); -} - #endif // SIMDJSON_THREADS_ENABLED } // namespace dom @@ -5136,7 +5152,7 @@ inline error_code document::allocate(size_t capacity) noexcept { // worse with "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6" //where len + 1 tape elements are // generated, see issue https://github.com/lemire/simdjson/issues/345 - size_t tape_capacity = ROUNDUP_N(capacity + 3, 64); + size_t tape_capacity = ROUNDUP_N(capacity + 2, 64); // a document with only zero-length strings... could have len/3 string // and we would need len/3 * 5 bytes on the string buffer size_t string_capacity = ROUNDUP_N(5 * capacity / 3 + 32, 64); @@ -6725,11 +6741,8 @@ namespace dom { // really_inline parser::parser(size_t max_capacity) noexcept : _max_capacity{max_capacity}, - loaded_bytes(nullptr, &aligned_free_char) { -} -really_inline parser::parser(parser &&other) noexcept = default; -really_inline parser &parser::operator=(parser &&other) noexcept = default; - + loaded_bytes(nullptr, &aligned_free_char) + {} inline bool parser::is_valid() const noexcept { return valid; } inline int parser::get_error_code() const noexcept { return error; } inline std::string parser::get_error_message() const noexcept { return error_message(error); } @@ -6812,12 +6825,15 @@ inline simdjson_result parser::parse(const uint8_t *buf, size_t len, bo memcpy((void *)buf, tmp_buf, len); } - code = implementation->parse(buf, len, doc); + code = simdjson::active_implementation->parse(buf, len, *this); if (realloc_if_needed) { aligned_free((void *)buf); // must free before we exit } if (code) { return code; } + // We're indicating validity via the simdjson_result, so set the parse state back to invalid + valid = false; + error = UNINITIALIZED; return doc.root(); } really_inline simdjson_result parser::parse(const char *buf, size_t len, bool realloc_if_needed) & noexcept { @@ -6844,30 +6860,81 @@ inline document_stream parser::parse_many(const padded_string &s, size_t batch_s } really_inline size_t parser::capacity() const noexcept { - return implementation ? implementation->capacity() : 0; + return _capacity; } really_inline size_t parser::max_capacity() const noexcept { return _max_capacity; } really_inline size_t parser::max_depth() const noexcept { - return implementation ? implementation->max_depth() : DEFAULT_MAX_DEPTH; + return _max_depth; } WARN_UNUSED inline error_code parser::allocate(size_t capacity, size_t max_depth) noexcept { // - // Reallocate implementation and document if needed + // If capacity has changed, reallocate capacity-based buffers // - error_code err; - if (implementation) { - err = implementation->allocate(capacity, max_depth); - } else { - err = simdjson::active_implementation->create_dom_parser_implementation(capacity, max_depth, implementation); + if (_capacity != capacity) { + // Set capacity to 0 until we finish, in case there's an error + _capacity = 0; + + // + // Reallocate the document + // + error_code err = doc.allocate(capacity); + if (err) { return err; } + + // + // Don't allocate 0 bytes, just return. + // + if (capacity == 0) { + structural_indexes.reset(); + return SUCCESS; + } + + // + // Initialize stage 1 output + // + size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; + structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); // TODO realloc + if (!structural_indexes) { + return MEMALLOC; + } + + _capacity = capacity; + + // + // If capacity hasn't changed, but the document was taken, allocate a new document. + // + } else if (!doc.tape) { + error_code err = doc.allocate(capacity); + if (err) { return err; } } - if (err) { return err; } - if (implementation->capacity() != capacity || !doc.tape) { - return doc.allocate(capacity); + // + // If max_depth has changed, reallocate those buffers + // + if (max_depth != _max_depth) { + _max_depth = 0; + + if (max_depth == 0) { + ret_address.reset(); + containing_scope.reset(); + return SUCCESS; + } + + // + // Initialize stage 2 state + // + containing_scope.reset(new (std::nothrow) internal::scope_descriptor[max_depth]); // TODO realloc + ret_address.reset(new (std::nothrow) internal::ret_address[max_depth]); + + if (!ret_address || !containing_scope) { + // Could not allocate memory + return MEMALLOC; + } + + _max_depth = max_depth; } return SUCCESS; } @@ -6877,24 +6944,24 @@ inline bool parser::allocate_capacity(size_t capacity, size_t max_depth) noexcep return !allocate(capacity, max_depth); } +really_inline void parser::set_max_capacity(size_t max_capacity) noexcept { + _max_capacity = max_capacity; +} + inline error_code parser::ensure_capacity(size_t desired_capacity) noexcept { // If we don't have enough capacity, (try to) automatically bump it. // If the document was taken, reallocate that too. // Both in one if statement to minimize unlikely branching. - if (unlikely(capacity() < desired_capacity || !doc.tape)) { + if (unlikely(desired_capacity > capacity() || !doc.tape)) { if (desired_capacity > max_capacity()) { return error = CAPACITY; } - return allocate(desired_capacity, max_depth()); + return allocate(desired_capacity, _max_depth > 0 ? _max_depth : DEFAULT_MAX_DEPTH); } return SUCCESS; } -really_inline void parser::set_max_capacity(size_t max_capacity) noexcept { - _max_capacity = max_capacity; -} - } // namespace dom } // namespace simdjson From b79304d460f9b84de6e2a14be646cc2931089b14 Mon Sep 17 00:00:00 2001 From: Brendan Knapp Date: Tue, 16 Jun 2020 17:48:00 -0700 Subject: [PATCH 16/16] add vectorized versions of .deserialize_json() and .load_json(), document, add tests, and rebuild --- R/RcppExports.R | 62 ++++++- inst/tinytest/test_load_json.R | 14 ++ inst/tinytest/test_vectorized_ops.R | 82 +++++++++ man/dot-deserialize_json.Rd | 71 +++++++- src/RcppExports.cpp | 8 +- src/deserialize.cpp | 263 ++++++++++++++++++++++++---- 6 files changed, 448 insertions(+), 52 deletions(-) create mode 100644 inst/tinytest/test_load_json.R create mode 100644 inst/tinytest/test_vectorized_ops.R diff --git a/R/RcppExports.R b/R/RcppExports.R index 75710c2..4bfbfa4 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -3,7 +3,7 @@ #' Deserialize JSON into R Objects #' -#' @param json \code{character(1L)} +#' @param json \code{character()} containing one or more strings of JSON data. #' #' @param json_pointer \code{character(1L)}, default: \code{""} #' @@ -11,21 +11,69 @@ #' #' @param empty_object default: \code{NULL}. Any R object to return for empty JSON objects. #' -#' @param simplify_to default: \code{0}. Maximum simplification level. -#' 0=dataframe, 1=matrix, 2=vector, 3=list +#' @param simplify_to \code{integer(1L)}, default: \code{0L}. +#' Maximum simplification level. +#' 0: data frame, 1: matrix, 2: vector, 3: list (no simplification) #' -#' @param type_policy default: \code{0}. Level of type strictness. -#' 0=anything goes, 1=merge integers/doubles, 2=strict +#' @param type_policy \code{integer(1L)}, default: \code{0L}. +#' Level of type strictness. +#' 0: merge everything, 1: merge numbers, 2: strict (mixed types are not merged) #' -#' @param int64_r_type default: \code{0} How to return big integers to R. -#' 0=double, 1=string, 2=bit64::integer64 +#' @param int64_r_type \code{integer(1L)} default: \code{0L} +#' How to return big integers to R. +#' 0: \code{double}, 1: string, 2: \code{bit64::integer64}-compatible number +#' +#' @details +#' Instead of using \code{lapply()} for vectors containing multiple strings/file paths, +#' just use \code{.deserialize_json()} and \code{.load_json()} directly as they are vectorized +#' (in the R sense). This is much more efficient as the underlying \code{simdjson::parser} can +#' reuse internal buffers between parses. Since the overwhelming majority of JSON objects +#' parsed will not result in R scalars, a \code{list()} is always returned when multiple items +#' are passed to \code{.deserialize_json()} or \code{.load_json()}. Also in keeping with +#' \code{lapply()}'s behavior, if the data passed has \code{names()}, the returned object will +#' have the same names. #' #' @keywords internal #' +#' @examples +#' # .deserialize_json() ====================================================== +#' RcppSimdJson:::.deserialize_json('[[1,2,3],[4,5,6]]') +#' +#' RcppSimdJson:::.deserialize_json( +#' '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]' +#' ) +#' +#' RcppSimdJson:::.deserialize_json( +#' c( +#' json1 = "[[1,2,3],[4,5,6]]", +#' json2 = '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]' +#' ) +#' ) +#' .deserialize_json <- function(json, json_pointer = "", empty_array = NULL, empty_object = NULL, simplify_to = 0L, type_policy = 0L, int64_r_type = 0L) { .Call(`_RcppSimdJson_deserialize_json`, json, json_pointer, empty_array, empty_object, simplify_to, type_policy, int64_r_type) } +#' @rdname dot-deserialize_json +#' +#' @param file_path \code{character()} containing one or more paths to files containing +#' JSON data. +#' +#' @examples +#' # .load_json() ============================================================= +#' single_file <- system.file("jsonexamples", "small", "flatadversarial.json", +#' package = "RcppSimdJson") +#' RcppSimdJson:::.load_json(single_file) +#' +#' multiple_files <- vapply( +#' c("flatadversarial.json", "adversarial.json"), +#' function(.x) { +#' system.file("jsonexamples/small", .x, package = "RcppSimdJson") +#' }, +#' character(1L) +#' ) +#' RcppSimdJson:::.load_json(multiple_files) +#' .load_json <- function(file_path, json_pointer = "", empty_array = NULL, empty_object = NULL, simplify_to = 0L, type_policy = 0L, int64_r_type = 0L) { .Call(`_RcppSimdJson_load_json`, file_path, json_pointer, empty_array, empty_object, simplify_to, type_policy, int64_r_type) } diff --git a/inst/tinytest/test_load_json.R b/inst/tinytest/test_load_json.R new file mode 100644 index 0000000..f0262f2 --- /dev/null +++ b/inst/tinytest/test_load_json.R @@ -0,0 +1,14 @@ +if (RcppSimdJson:::.unsupportedArchitecture()) exit_file("Unsupported chipset") + +all_files <- dir("../jsonexamples", pattern = "\\.json$", + recursive = TRUE, full.names = TRUE) + +sapply(all_files, function(.x) expect_silent(RcppSimdJson:::.load_json(.x))) + +expect_error( + RcppSimdJson:::.load_json("../jsonexamples/amazon_cellphones.ndjson") +) + +expect_error( + RcppSimdJson:::.load_json("not/a/real/file.json") +) diff --git a/inst/tinytest/test_vectorized_ops.R b/inst/tinytest/test_vectorized_ops.R new file mode 100644 index 0000000..1199a35 --- /dev/null +++ b/inst/tinytest/test_vectorized_ops.R @@ -0,0 +1,82 @@ +if (RcppSimdJson:::.unsupportedArchitecture()) exit_file("Unsupported chipset") + +# .deserialize_json ============================================================ +test <- c( + first = '{"A":[[1,2,3],[4,5,6]]}', + second = '{"B":[{"a":1,"b":true},{"a":2,"b":false,"c":null}]}' +) + +target <- list( + first = list( + A = matrix( + c( + 1L, 2L, 3L, + 4L, 5L, 6L + ), + nrow = 2L, ncol = 3L, byrow = TRUE + ) + ), + second = list( + B = data.frame( + a = c(1L, 2L), + b = c(TRUE, FALSE), + c = c(NA, NA) + ) + ) +) + +expect_identical( + RcppSimdJson:::.deserialize_json(test), + target +) + +# confirm errors work ---------------------------------------------------------- +test <- c( + first = '{"A":[[1,2,3],[4,5,6]]}', + bad_json = '{"B":[{"a":1,"b":JUNK},{"a":2,"b":false,"c":null}]}' +) +expect_error( + RcppSimdJson:::.deserialize_json(test) +) +# .load_json() ================================================================= +test <- c( + flatadversarial.json = "../jsonexamples/small/flatadversarial.json", + adversarial.json = "../jsonexamples/small/adversarial.json" +) + +if (!all(file.exists(test))) { + exit_file( + "flatadversarial.json and/or adversarial.json are missing." + ) +} + +target <- list( + flatadversarial.json = list( + `"Name` = c("116", "\\\"", "234", "true", "FALSE"), t = 1e+10 + ), + adversarial.json = list( + `"Name rue` = structure( + c("116", "\"", "234", "true", "FALSE"), + .Dim = c(1L, 5L) + ) + ) +) + +expect_identical( + RcppSimdJson:::.load_json(test), + target +) + +# all files battery ------------------------------------------------------------ +all_files <- dir("inst/jsonexamples", pattern = "\\.json$", + recursive = TRUE, full.names = TRUE) +expect_silent( + RcppSimdJson:::.load_json(all_files) +) +# confirm errors work ---------------------------------------------------------- +expect_error( + RcppSimdJson:::.load_json(c("a/fake/file.json", all_files)) +) +expect_error( + RcppSimdJson:::.load_json(c(all_files, "another/fake/file.json")) +) diff --git a/man/dot-deserialize_json.Rd b/man/dot-deserialize_json.Rd index 72cfeed..f7ef4ac 100644 --- a/man/dot-deserialize_json.Rd +++ b/man/dot-deserialize_json.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/RcppExports.R \name{.deserialize_json} \alias{.deserialize_json} +\alias{.load_json} \title{Deserialize JSON into R Objects} \usage{ .deserialize_json( @@ -13,9 +14,19 @@ type_policy = 0L, int64_r_type = 0L ) + +.load_json( + file_path, + json_pointer = "", + empty_array = NULL, + empty_object = NULL, + simplify_to = 0L, + type_policy = 0L, + int64_r_type = 0L +) } \arguments{ -\item{json}{\code{character(1L)}} +\item{json}{\code{character()} containing one or more strings of JSON data.} \item{json_pointer}{\code{character(1L)}, default: \code{""}} @@ -23,16 +34,62 @@ \item{empty_object}{default: \code{NULL}. Any R object to return for empty JSON objects.} -\item{simplify_to}{default: \code{0}. Maximum simplification level. -0=dataframe, 1=matrix, 2=vector, 3=list} +\item{simplify_to}{\code{integer(1L)}, default: \code{0L}. +Maximum simplification level. +0: data frame, 1: matrix, 2: vector, 3: list (no simplification)} -\item{type_policy}{default: \code{0}. Level of type strictness. -0=anything goes, 1=merge integers/doubles, 2=strict} +\item{type_policy}{\code{integer(1L)}, default: \code{0L}. +Level of type strictness. +0: merge everything, 1: merge numbers, 2: strict (mixed types are not merged)} -\item{int64_r_type}{default: \code{0} How to return big integers to R. -0=double, 1=string, 2=bit64::integer64} +\item{int64_r_type}{\code{integer(1L)} default: \code{0L} +How to return big integers to R. +0: \code{double}, 1: string, 2: \code{bit64::integer64}-compatible number} + +\item{file_path}{\code{character()} containing one or more paths to files containing +JSON data.} } \description{ Deserialize JSON into R Objects +} +\details{ +Instead of using \code{lapply()} for vectors containing multiple strings/file paths, + just use \code{.deserialize_json()} and \code{.load_json()} directly as they are vectorized + (in the R sense). This is much more efficient as the underlying \code{simdjson::parser} can + reuse internal buffers between parses. Since the overwhelming majority of JSON objects + parsed will not result in R scalars, a \code{list()} is always returned when multiple items + are passed to \code{.deserialize_json()} or \code{.load_json()}. Also in keeping with + \code{lapply()}'s behavior, if the data passed has \code{names()}, the returned object will + have the same names. +} +\examples{ +# .deserialize_json() ====================================================== +RcppSimdJson:::.deserialize_json('[[1,2,3],[4,5,6]]') + +RcppSimdJson:::.deserialize_json( + '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]' +) + +RcppSimdJson:::.deserialize_json( + c( + json1 = "[[1,2,3],[4,5,6]]", + json2 = '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]' + ) +) + +# .load_json() ============================================================= +single_file <- system.file("jsonexamples", "small", "flatadversarial.json", + package = "RcppSimdJson") +RcppSimdJson:::.load_json(single_file) + +multiple_files <- vapply( + c("flatadversarial.json", "adversarial.json"), + function(.x) { + system.file("jsonexamples/small", .x, package = "RcppSimdJson") + }, + character(1L) +) +RcppSimdJson:::.load_json(multiple_files) + } \keyword{internal} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index a149941..770e97e 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -6,12 +6,12 @@ using namespace Rcpp; // deserialize_json -SEXP deserialize_json(const Rcpp::String& json, const std::string& json_pointer, SEXP empty_array, SEXP empty_object, const int simplify_to, const int type_policy, const int int64_r_type); +SEXP deserialize_json(const Rcpp::CharacterVector& json, const std::string& json_pointer, SEXP empty_array, SEXP empty_object, const int simplify_to, const int type_policy, const int int64_r_type); RcppExport SEXP _RcppSimdJson_deserialize_json(SEXP jsonSEXP, SEXP json_pointerSEXP, SEXP empty_arraySEXP, SEXP empty_objectSEXP, SEXP simplify_toSEXP, SEXP type_policySEXP, SEXP int64_r_typeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const Rcpp::String& >::type json(jsonSEXP); + Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type json(jsonSEXP); Rcpp::traits::input_parameter< const std::string& >::type json_pointer(json_pointerSEXP); Rcpp::traits::input_parameter< SEXP >::type empty_array(empty_arraySEXP); Rcpp::traits::input_parameter< SEXP >::type empty_object(empty_objectSEXP); @@ -23,12 +23,12 @@ BEGIN_RCPP END_RCPP } // load_json -SEXP load_json(const std::string& file_path, const std::string& json_pointer, SEXP empty_array, SEXP empty_object, const int simplify_to, const int type_policy, const int int64_r_type); +SEXP load_json(const Rcpp::CharacterVector& file_path, const std::string& json_pointer, SEXP empty_array, SEXP empty_object, const int simplify_to, const int type_policy, const int int64_r_type); RcppExport SEXP _RcppSimdJson_load_json(SEXP file_pathSEXP, SEXP json_pointerSEXP, SEXP empty_arraySEXP, SEXP empty_objectSEXP, SEXP simplify_toSEXP, SEXP type_policySEXP, SEXP int64_r_typeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const std::string& >::type file_path(file_pathSEXP); + Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type file_path(file_pathSEXP); Rcpp::traits::input_parameter< const std::string& >::type json_pointer(json_pointerSEXP); Rcpp::traits::input_parameter< SEXP >::type empty_array(empty_arraySEXP); Rcpp::traits::input_parameter< SEXP >::type empty_object(empty_objectSEXP); diff --git a/src/deserialize.cpp b/src/deserialize.cpp index 8d10fe0..f37f693 100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -1,9 +1,74 @@ #include +SEXP deserialize_single_string(const Rcpp::CharacterVector& json, + const std::string& json_pointer, + SEXP empty_array, + SEXP empty_object, + const rcppsimdjson::deserialize::Simplify_To simplify_to, + const rcppsimdjson::deserialize::Type_Policy type_policy, + const rcppsimdjson::utils::Int64_R_Type int64_r_type) { + using namespace rcppsimdjson; + + simdjson::dom::parser parser; + auto [parsed, error] = json_pointer.empty() // + ? parser.parse(std::string(json[0])) + : parser.parse(std::string(json[0])).at(json_pointer); + if (error) { + Rcpp::stop(simdjson::error_message(error)); + } + + return deserialize::deserialize( // + parsed, // + empty_array, // + empty_object, // + simplify_to, // + type_policy, // + int64_r_type // + ); // +} + + +SEXP deserialize_multiple_strings(const Rcpp::CharacterVector& json, + const std::string& json_pointer, + SEXP empty_array, + SEXP empty_object, + const rcppsimdjson::deserialize::Simplify_To simplify_to, + const rcppsimdjson::deserialize::Type_Policy type_policy, + const rcppsimdjson::utils::Int64_R_Type int64_r_type) { + using namespace rcppsimdjson; + + const auto n = json.length(); + Rcpp::List out(n); + simdjson::dom::parser parser; + + for (R_xlen_t i = 0; i < n; ++i) { + auto [parsed, error] = json_pointer.empty() // + ? parser.parse(std::string(json[i])) + : parser.parse(std::string(json[i])).at(json_pointer); + if (error) { + Rcpp::stop("%s\n\nValue affected:\n\t- `json[[%d]]`", simdjson::error_message(error), i + 1); + } + + out[i] = deserialize::deserialize( // + parsed, // + empty_array, // + empty_object, // + simplify_to, // + type_policy, // + int64_r_type // + ); // + } + + out.attr("names") = json.attr("names"); + + return out; +} + + //' Deserialize JSON into R Objects //' -//' @param json \code{character(1L)} +//' @param json \code{character()} containing one or more strings of JSON data. //' //' @param json_pointer \code{character(1L)}, default: \code{""} //' @@ -11,72 +76,202 @@ //' //' @param empty_object default: \code{NULL}. Any R object to return for empty JSON objects. //' -//' @param simplify_to default: \code{0}. Maximum simplification level. -//' 0=dataframe, 1=matrix, 2=vector, 3=list +//' @param simplify_to \code{integer(1L)}, default: \code{0L}. +//' Maximum simplification level. +//' 0: data frame, 1: matrix, 2: vector, 3: list (no simplification) +//' +//' @param type_policy \code{integer(1L)}, default: \code{0L}. +//' Level of type strictness. +//' 0: merge everything, 1: merge numbers, 2: strict (mixed types are not merged) //' -//' @param type_policy default: \code{0}. Level of type strictness. -//' 0=anything goes, 1=merge integers/doubles, 2=strict +//' @param int64_r_type \code{integer(1L)} default: \code{0L} +//' How to return big integers to R. +//' 0: \code{double}, 1: string, 2: \code{bit64::integer64}-compatible number //' -//' @param int64_r_type default: \code{0} How to return big integers to R. -//' 0=double, 1=string, 2=bit64::integer64 +//' @details +//' Instead of using \code{lapply()} for vectors containing multiple strings/file paths, +//' just use \code{.deserialize_json()} and \code{.load_json()} directly as they are vectorized +//' (in the R sense). This is much more efficient as the underlying \code{simdjson::parser} can +//' reuse internal buffers between parses. Since the overwhelming majority of JSON objects +//' parsed will not result in R scalars, a \code{list()} is always returned when multiple items +//' are passed to \code{.deserialize_json()} or \code{.load_json()}. Also in keeping with +//' \code{lapply()}'s behavior, if the data passed has \code{names()}, the returned object will +//' have the same names. //' //' @keywords internal //' +//' @examples +//' # .deserialize_json() ====================================================== +//' RcppSimdJson:::.deserialize_json('[[1,2,3],[4,5,6]]') +//' +//' RcppSimdJson:::.deserialize_json( +//' '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]' +//' ) +//' +//' RcppSimdJson:::.deserialize_json( +//' c( +//' json1 = "[[1,2,3],[4,5,6]]", +//' json2 = '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]' +//' ) +//' ) +//' // [[Rcpp::export(.deserialize_json)]] -SEXP deserialize_json(const Rcpp::String& json, +SEXP deserialize_json(const Rcpp::CharacterVector& json, const std::string& json_pointer = "", SEXP empty_array = R_NilValue, SEXP empty_object = R_NilValue, const int simplify_to = 0, const int type_policy = 0, const int int64_r_type = 0) { - using namespace rcppsimdjson; + switch (json.length()) { + case 0: + return R_NilValue; - simdjson::dom::parser parser; + case 1: + return deserialize_single_string( // + json, // + json_pointer, // + empty_array, // + empty_object, // + static_cast(simplify_to), // + static_cast(type_policy), // + static_cast(int64_r_type) // + ); // - auto [parsed, error] = json_pointer.empty() // - ? parser.parse(json) - : parser.parse(json).at(json_pointer); + default: + return deserialize_multiple_strings( // + json, // + json_pointer, // + empty_array, // + empty_object, // + static_cast(simplify_to), // + static_cast(type_policy), // + static_cast(int64_r_type) // + ); // + } + return R_NilValue; +} + + +SEXP load_single_file(const Rcpp::String& file_path, + const std::string& json_pointer, + SEXP empty_array, + SEXP empty_object, + const rcppsimdjson::deserialize::Simplify_To simplify_to, + const rcppsimdjson::deserialize::Type_Policy type_policy, + const rcppsimdjson::utils::Int64_R_Type int64_r_type) { + simdjson::dom::parser parser; + auto [parsed, error] = json_pointer.empty() // + ? parser.load(std::string(file_path)) + : parser.load(std::string(file_path)).at(json_pointer); if (error) { Rcpp::stop(simdjson::error_message(error)); } - return deserialize::deserialize(parsed, - empty_array, - empty_object, - static_cast(simplify_to), - static_cast(type_policy), - static_cast(int64_r_type)); + return rcppsimdjson::deserialize::deserialize( // + parsed, // + empty_array, // + empty_object, // + simplify_to, // + type_policy, // + int64_r_type // + ); // +} + + +SEXP load_multiple_files(const Rcpp::CharacterVector& file_path, + const std::string& json_pointer, + SEXP empty_array, + SEXP empty_object, + const rcppsimdjson::deserialize::Simplify_To simplify_to, + const rcppsimdjson::deserialize::Type_Policy type_policy, + const rcppsimdjson::utils::Int64_R_Type int64_r_type) { + const auto n = file_path.length(); + auto out = Rcpp::List(n); + simdjson::dom::parser parser; + + for (R_xlen_t i = 0; i < n; ++i) { + auto [parsed, error] = json_pointer.empty() // + ? parser.load(std::string(file_path[i])) + : parser.load(std::string(file_path[i])).at(json_pointer); + if (error) { + Rcpp::stop("%s\n\nFile affected:\n\t- %s", // + simdjson::error_message(error), // + std::string(file_path[i])); // + } + + out[i] = rcppsimdjson::deserialize::deserialize( // + parsed, // + empty_array, // + empty_object, // + simplify_to, // + type_policy, // + int64_r_type // + ); // + } + + out.attr("names") = file_path.attr("names"); + return out; } +//' @rdname dot-deserialize_json +//' +//' @param file_path \code{character()} containing one or more paths to files containing +//' JSON data. +//' +//' @examples +//' # .load_json() ============================================================= +//' single_file <- system.file("jsonexamples", "small", "flatadversarial.json", +//' package = "RcppSimdJson") +//' RcppSimdJson:::.load_json(single_file) +//' +//' multiple_files <- vapply( +//' c("flatadversarial.json", "adversarial.json"), +//' function(.x) { +//' system.file("jsonexamples/small", .x, package = "RcppSimdJson") +//' }, +//' character(1L) +//' ) +//' RcppSimdJson:::.load_json(multiple_files) +//' // [[Rcpp::export(.load_json)]] -SEXP load_json(const std::string& file_path, +SEXP load_json(const Rcpp::CharacterVector& file_path, const std::string& json_pointer = "", SEXP empty_array = R_NilValue, SEXP empty_object = R_NilValue, const int simplify_to = 0, const int type_policy = 0, const int int64_r_type = 0) { - using namespace rcppsimdjson; - - simdjson::dom::parser parser; + switch (file_path.length()) { + case 0: + return R_NilValue; - auto [parsed, error] = json_pointer.empty() // - ? parser.load(file_path) - : parser.load(file_path).at(json_pointer); + case 1: + return load_single_file( // + file_path[0], // + json_pointer, // + empty_array, // + empty_object, // + static_cast(simplify_to), // + static_cast(type_policy), // + static_cast(int64_r_type) // + ); // - if (error) { - Rcpp::stop(simdjson::error_message(error)); + default: + return load_multiple_files( // + file_path, // + json_pointer, // + empty_array, // + empty_object, // + static_cast(simplify_to), // + static_cast(type_policy), // + static_cast(int64_r_type) // + ); // } - return deserialize::deserialize(parsed, - empty_array, - empty_object, - static_cast(simplify_to), - static_cast(type_policy), - static_cast(int64_r_type)); + return R_NilValue; }