diff --git a/R/RcppExports.R b/R/RcppExports.R index 4664673..4bfbfa4 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -3,7 +3,7 @@ #' Deserialize JSON into R Objects #' -#' @param json \code{character(1L)} +#' @param json \code{character()} containing one or more strings of JSON data. #' #' @param json_pointer \code{character(1L)}, default: \code{""} #' @@ -11,21 +11,73 @@ #' #' @param empty_object default: \code{NULL}. Any R object to return for empty JSON objects. #' -#' @param simplify_to default: \code{0}. Maximum simplification level. -#' 0=dataframe, 1=matrix, 2=vector, 3=list +#' @param simplify_to \code{integer(1L)}, default: \code{0L}. +#' Maximum simplification level. +#' 0: data frame, 1: matrix, 2: vector, 3: list (no simplification) #' -#' @param type_policy default: \code{0}. Level of type strictness. -#' 0=anything goes, 1=merge integers/doubles, 2=strict +#' @param type_policy \code{integer(1L)}, default: \code{0L}. +#' Level of type strictness. +#' 0: merge everything, 1: merge numbers, 2: strict (mixed types are not merged) #' -#' @param int64_r_type default: \code{0} How to return big integers to R. -#' 0=double, 1=string, 2=bit64::integer64 +#' @param int64_r_type \code{integer(1L)} default: \code{0L} +#' How to return big integers to R. +#' 0: \code{double}, 1: string, 2: \code{bit64::integer64}-compatible number +#' +#' @details +#' Instead of using \code{lapply()} for vectors containing multiple strings/file paths, +#' just use \code{.deserialize_json()} and \code{.load_json()} directly as they are vectorized +#' (in the R sense). This is much more efficient as the underlying \code{simdjson::parser} can +#' reuse internal buffers between parses. Since the overwhelming majority of JSON objects +#' parsed will not result in R scalars, a \code{list()} is always returned when multiple items +#' are passed to \code{.deserialize_json()} or \code{.load_json()}. Also in keeping with +#' \code{lapply()}'s behavior, if the data passed has \code{names()}, the returned object will +#' have the same names. #' #' @keywords internal #' +#' @examples +#' # .deserialize_json() ====================================================== +#' RcppSimdJson:::.deserialize_json('[[1,2,3],[4,5,6]]') +#' +#' RcppSimdJson:::.deserialize_json( +#' '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]' +#' ) +#' +#' RcppSimdJson:::.deserialize_json( +#' c( +#' json1 = "[[1,2,3],[4,5,6]]", +#' json2 = '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]' +#' ) +#' ) +#' .deserialize_json <- function(json, json_pointer = "", empty_array = NULL, empty_object = NULL, simplify_to = 0L, type_policy = 0L, int64_r_type = 0L) { .Call(`_RcppSimdJson_deserialize_json`, json, json_pointer, empty_array, empty_object, simplify_to, type_policy, int64_r_type) } +#' @rdname dot-deserialize_json +#' +#' @param file_path \code{character()} containing one or more paths to files containing +#' JSON data. +#' +#' @examples +#' # .load_json() ============================================================= +#' single_file <- system.file("jsonexamples", "small", "flatadversarial.json", +#' package = "RcppSimdJson") +#' RcppSimdJson:::.load_json(single_file) +#' +#' multiple_files <- vapply( +#' c("flatadversarial.json", "adversarial.json"), +#' function(.x) { +#' system.file("jsonexamples/small", .x, package = "RcppSimdJson") +#' }, +#' character(1L) +#' ) +#' RcppSimdJson:::.load_json(multiple_files) +#' +.load_json <- function(file_path, json_pointer = "", empty_array = NULL, empty_object = NULL, simplify_to = 0L, type_policy = 0L, int64_r_type = 0L) { + .Call(`_RcppSimdJson_load_json`, file_path, json_pointer, empty_array, empty_object, simplify_to, type_policy, int64_r_type) +} + .exceptions_enabled <- function() { .Call(`_RcppSimdJson_exceptions_enabled`) } diff --git a/inst/include/RcppSimdJson.hpp b/inst/include/RcppSimdJson.hpp index cef0285..1c0f8b8 100644 --- a/inst/include/RcppSimdJson.hpp +++ b/inst/include/RcppSimdJson.hpp @@ -1,68 +1,6 @@ #ifndef RCPPSIMDJSON_HPP #define RCPPSIMDJSON_HPP -#define STRICT_R_HEADERS -#include - - -namespace rcppsimdjson { - -static inline constexpr int64_t NA_INTEGER64 = LLONG_MIN; - - -enum class rcpp_T : int { - array = 0, - object = 1, - chr = 2, - u64 = 3, - dbl = 4, - i64 = 5, - i32 = 6, - lgl = 7, - null = 8, -}; - - -template static inline constexpr auto na_val() { - if constexpr (R_Type == rcpp_T::chr) { - return NA_STRING; - } - if constexpr (R_Type == rcpp_T::dbl) { - return NA_REAL; - } - if constexpr (R_Type == rcpp_T::i64) { - return NA_INTEGER64; - } - if constexpr (R_Type == rcpp_T::i32) { - return NA_INTEGER; - } - if constexpr (R_Type == rcpp_T::lgl) { - return NA_LOGICAL; - } -} - - -// #define SIMDJSON_EXCEPTIONS 0 -#ifdef SIMDJSON_EXCEPTIONS -#define RCPPSIMDJSON_EXCEPTIONS SIMDJSON_EXCEPTIONS -static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = SIMDJSON_EXCEPTIONS != 1; -#else -#define RCPPSIMDJSON_EXCEPTIONS 1 -static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = false; -#endif - - -static inline constexpr auto is_no_except(rcpp_T R_Type) -> bool { - // all scalars seem to be extractable w/o touching throwing code except for strings - return RCPPSIMDJSON_NO_EXCEPTIONS && R_Type != rcpp_T::chr; -} - - -} // namespace rcppsimdjson - -#include - -#include "RcppSimdJson/utils.hpp" #include "RcppSimdJson/deserialize.hpp" #endif diff --git a/inst/include/RcppSimdJson/common.hpp b/inst/include/RcppSimdJson/common.hpp new file mode 100644 index 0000000..15a3a49 --- /dev/null +++ b/inst/include/RcppSimdJson/common.hpp @@ -0,0 +1,151 @@ +#ifndef RCPPSIMDJSON_COMMON_HPP +#define RCPPSIMDJSON_COMMON_HPP + + +#define STRICT_R_HEADERS +#include + + +namespace rcppsimdjson { + +/** + * @brief A container's size as an @c R_xlen_t @c. Otherwise Equivalent to @c std::size() @c. + */ +template +inline constexpr auto r_length(const _Container& __cont) noexcept -> R_xlen_t { + return static_cast(std::size(__cont)); +} + + +/** + * @brief A @c bit64::integer64 @c-compatible @c NA @c. + */ +static inline constexpr int64_t NA_INTEGER64 = LLONG_MIN; + + +/** + * @brief Typing arguments that decide how a @c simdjson::dom::element is ultimately returned to R. + */ +enum class rcpp_T : int { + array = 0, /**< recursive: individual elements will decide ultimate R type */ + object = 1, /**< recursive: individual elements will decide ultimate R type */ + chr = 2, /**< always becomes @c Rcpp::String / @c character */ + u64 = 3, /**< always becomes @c Rcpp::String / @c character */ + dbl = 4, /**< always becomes @c double */ + i64 = 5, /**< follows @c Int64_R_Type: @c double, @c character, or @c bit64::integer64 */ + i32 = 6, /**< always becomes @c int */ + lgl = 7, /**< always becomes @c bool / @c logical */ + null = 8, /**< becomes @c NA if returned in a vector, else @c NULL */ +}; + + +/** + * @brief Get a typed @c NA @c. + */ +template static inline constexpr auto na_val() { + if constexpr (R_Type == rcpp_T::chr) { + return NA_STRING; + } + if constexpr (R_Type == rcpp_T::dbl) { + return NA_REAL; + } + if constexpr (R_Type == rcpp_T::i64) { + return NA_INTEGER64; + } + if constexpr (R_Type == rcpp_T::i32) { + return NA_INTEGER; + } + if constexpr (R_Type == rcpp_T::lgl) { + return NA_LOGICAL; + } +} + + +/** + * Internal flags tracking whether simdjson is compiled with exceptions enabled (the default). + * If simdjson is compiled w/o exceptions ( @c #define SIMDJSON_EXCEPTIONS 0 @c), operations that + * do not touch throwing code can be annotated with keyword @c noexcept where appropriate. + */ +// #define SIMDJSON_EXCEPTIONS 0 /* uncomment to disable compiling simdjson w/ exceptions */ +#ifdef SIMDJSON_EXCEPTIONS +#define RCPPSIMDJSON_EXCEPTIONS SIMDJSON_EXCEPTIONS +static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = SIMDJSON_EXCEPTIONS != 1; +#else +#define RCPPSIMDJSON_EXCEPTIONS 1 +static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = false; +#endif + + +/** + * @brief Whether a function is @c noexcept. + * + * If a function does not touch throwing code it can be annotated with @c noexcept(). + * If @c RCPPSIMDJSON_NO_EXCEPTIONS is enabled and the @c rcpp_T template argument is not + * @c rcpp_T::chr, functions annotated with @c noexcept(is_no_except(rcpp_T)) will be @c noexcept + * when compiled. + * + * Currently, @c rccp_T::chr touches throwing code so functions using it will always be + * @c noexcept(false). + * + * Many examples in @file{inst/include/RcppSimdJson/deserialize/scalar.hpp}. + */ +static inline constexpr auto is_no_except(rcpp_T R_Type) // NOLINT(clang-diagnostic-unused-function) + -> bool { + return RCPPSIMDJSON_NO_EXCEPTIONS && R_Type != rcpp_T::chr; +} + + +namespace deserialize { + + +/** + * @brief Determines level of type strictness in combining array elements into R vectors. + * + * When arrays are not homogeneous and @c Type_Policy::anything_goes is used, type promotion follows + * R's behavior. + */ +enum class Type_Policy : int { + anything_goes = 0, /* Non-recursive arrays always become vectors of the highest present type */ + ints_as_dbls = 1, /* Non-recursive arrays of only numbers are promoted to highest type */ + strict = 2, /* No type promotion */ +}; + + +/** + * @brief Maximum simplification level. + */ +enum class Simplify_To : int { + data_frame = 0, /* If possible, return dataframes. Otherwise return matrices/vectors/lists. */ + matrix = 1, /* If possible, return matrices. Otherwise return vectors/lists. */ + vector = 2, /* If possible, return vectors. Otherwise return lists. */ + list = 3, /* No simplification. */ +}; + + +} // namespace deserialize +} // namespace rcppsimdjson + + +#include "../simdjson.h" +#include "utils.hpp" + + +namespace rcppsimdjson { +namespace deserialize { + + +/** + * @brief Simplify a @c simdjson::dom::element to an R object. + * + * @note Forward declaration. See @file inst/include/RcppSimdJson/deserialize/simplify.hpp @file. + */ +template +inline auto simplify_element(simdjson::dom::element element, SEXP empty_array, SEXP empty_object) + -> SEXP; + + +} // namespace deserialize +} // namespace rcppsimdjson + + +#endif \ No newline at end of file diff --git a/inst/include/RcppSimdJson/deserialize.hpp b/inst/include/RcppSimdJson/deserialize.hpp index edcd970..c990bf9 100644 --- a/inst/include/RcppSimdJson/deserialize.hpp +++ b/inst/include/RcppSimdJson/deserialize.hpp @@ -1,20 +1,43 @@ #ifndef RCPPSIMDJSON__DESERIALIZE_HPP #define RCPPSIMDJSON__DESERIALIZE_HPP + #include "deserialize/simplify.hpp" + namespace rcppsimdjson { namespace deserialize { -// THE GREAT DISPATCHER + +/** + * @brief Deserialize a parsed @c simdjson::dom::element to R objects. + * + * + * @param element @c simdjson::dom::element to deserialize. + * + * @param empty_array R object to return when encountering an empty JSON array. + * + * @param empty_object R object to return when encountering an empty JSON object. + * + * @param type_policy @c Type_Policy specifying type strictness in combining mixed-type array + * elements into R vectors. + * + * @param int64_opt @c Int64_R_Type specifying how big integers are returned to R. + * + * @param simplify_to @c Simplify_To specifying the maximum level of simplification. + * + * + * @return The simplified R object ( @c SEXP ). + */ inline auto deserialize(const simdjson::dom::element parsed, - const SEXP empty_array, - const SEXP empty_object, + SEXP empty_array, + SEXP empty_object, const Simplify_To simplify_to, const Type_Policy type_policy, const utils::Int64_R_Type int64_opt) -> SEXP { using Int64_R_Type = utils::Int64_R_Type; + // THE GREAT DISPATCHER switch (type_policy) { case Type_Policy::anything_goes: { switch (int64_opt) { @@ -248,4 +271,5 @@ inline auto deserialize(const simdjson::dom::element parsed, } // namespace deserialize } // namespace rcppsimdjson + #endif diff --git a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp index e272e20..70bfacc 100644 --- a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp +++ b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp @@ -1,18 +1,13 @@ #ifndef RCPPSIMDJSON__DESERIALIZE__TYPE_DOCTOR_HPP #define RCPPSIMDJSON__DESERIALIZE__TYPE_DOCTOR_HPP +#include "../common.hpp" + namespace rcppsimdjson { namespace deserialize { -enum class Type_Policy : int { - anything_goes = 0, - ints_as_dbls = 1, - strict = 2, -}; - - template class Type_Doctor { bool ARRAY_ = false; bool array_ = false; @@ -39,6 +34,7 @@ template class Type_Doctor { bool UINT64_ = false; bool u64_ = false; + public: Type_Doctor() = default; explicit Type_Doctor(simdjson::dom::array) noexcept; @@ -97,7 +93,7 @@ template class Type_Doctor { auto add_element(simdjson::dom::element) noexcept -> void; - constexpr auto update(Type_Doctor&& type_doctor) noexcept -> void; + constexpr auto update(Type_Doctor&&) noexcept -> void; }; @@ -226,11 +222,11 @@ inline constexpr auto Type_Doctor::common_R_type() co if (chr_ && !(dbl_ || i64_ || i32_ || lgl_ || u64_)) { return rcpp_T::chr; } - + if (dbl_ && !(lgl_ || u64_)) { // any number will become double return rcpp_T::dbl; } - if (i64_ && !(i32_ || lgl_ || u64_)) { + if (i64_ && !(lgl_ || u64_)) { // only 64/32-bit integers: will follow selected Int64_R_Type option return rcpp_T::i64; } @@ -325,7 +321,7 @@ template inline constexpr auto Type_Doctor::common_element_type() const noexcept -> simdjson::dom::element_type { - using namespace simdjson::dom; + using simdjson::dom::element_type; return ARRAY_ ? element_type::ARRAY : OBJECT_ ? element_type::OBJECT diff --git a/inst/include/RcppSimdJson/deserialize/dataframe.hpp b/inst/include/RcppSimdJson/deserialize/dataframe.hpp index 0a8a0e2..0efa7c4 100644 --- a/inst/include/RcppSimdJson/deserialize/dataframe.hpp +++ b/inst/include/RcppSimdJson/deserialize/dataframe.hpp @@ -1,6 +1,7 @@ #ifndef RCPPSIMDJSON__DESERIALIZE__DATAFRAME_HPP #define RCPPSIMDJSON__DESERIALIZE__DATAFRAME_HPP + #include "matrix.hpp" @@ -8,7 +9,7 @@ namespace rcppsimdjson { namespace deserialize { template struct Column { - R_xlen_t index = 0; + R_xlen_t index = 0L; Type_Doctor schema = Type_Doctor(); }; @@ -24,7 +25,7 @@ diagnose_data_frame(const simdjson::dom::array array) noexcept(RCPPSIMDJSON_NO_E -> std::optional> { auto cols = Column_Schema(); - auto col_index = 0; + auto col_index = R_xlen_t(0L); if (std::size(array) == 0) { return std::nullopt; @@ -59,7 +60,7 @@ inline auto build_col(const simdjson::dom::array array, const Type_Doctor& type_doc) -> Rcpp::Vector { auto out = Rcpp::Vector(std::size(array), na_val()); - auto i_row = R_xlen_t(0); + auto i_row = R_xlen_t(0L); if (type_doc.is_homogeneous()) { if (type_doc.has_null()) { @@ -112,7 +113,7 @@ inline auto build_col_integer64(const simdjson::dom::array array, if constexpr (int64_opt == utils::Int64_R_Type::Integer64) { auto stl_vec = std::vector(std::size(array), NA_INTEGER64); - auto i_row = std::size_t(0); + auto i_row = std::size_t(0ULL); if (type_doc.is_homogeneous()) { if (type_doc.has_null()) { @@ -159,6 +160,79 @@ inline auto build_col_integer64(const simdjson::dom::array array, } +template +inline auto build_data_frame(const simdjson::dom::array array, + const std::map>& cols, + SEXP empty_array, + SEXP empty_object) -> SEXP { + + const auto n_rows = R_xlen_t(std::size(array)); + auto out = Rcpp::List(std::size(cols)); + auto out_names = Rcpp::CharacterVector(std::size(cols)); + + for (auto [key, col] : cols) { + out_names[col.index] = std::string(key); + + switch (col.schema.common_R_type()) { + case rcpp_T::chr: { + out[col.index] = + build_col(array, key, col.schema); + break; + } + + case rcpp_T::dbl: { + out[col.index] = + build_col(array, key, col.schema); + break; + } + + case rcpp_T::i64: { + out[col.index] = build_col_integer64(array, key, col.schema); + break; + } + + case rcpp_T::i32: { + out[col.index] = + build_col(array, key, col.schema); + break; + } + + case rcpp_T::lgl: { + out[col.index] = build_col(array, key, col.schema); + break; + } + + case rcpp_T::null: { + out[col.index] = Rcpp::LogicalVector(n_rows, NA_LOGICAL); + break; + } + + default: { + auto this_col = Rcpp::Vector(n_rows); + auto i_row = R_xlen_t(0L); + for (auto element : array) { + auto [value, error] = element.get().at_key(key); + if (error) { + this_col[i_row++] = NA_LOGICAL; + } else { + this_col[i_row++] = simplify_element( + value, empty_array, empty_object // + ); + } + } + out[col.index] = this_col; + } + } + } + + out.attr("names") = out_names; + out.attr("row.names") = Rcpp::seq(1, n_rows); + out.attr("class") = "data.frame"; + + return out; +} + + } // namespace deserialize } // namespace rcppsimdjson diff --git a/inst/include/RcppSimdJson/deserialize/matrix.hpp b/inst/include/RcppSimdJson/deserialize/matrix.hpp index 074ca34..edd173f 100644 --- a/inst/include/RcppSimdJson/deserialize/matrix.hpp +++ b/inst/include/RcppSimdJson/deserialize/matrix.hpp @@ -47,23 +47,13 @@ inline auto diagnose(simdjson::dom::array array) noexcept(RCPPSIMDJSON_NO_EXCEPT template -inline auto build_matrix_typed(simdjson::dom::array array, const std::size_t n_cols) +inline auto build_matrix_typed(simdjson::dom::array array, std::size_t n_cols) -> Rcpp::Vector { - const auto n_rows = std::size(array); - auto out = Rcpp::Matrix(n_rows, n_cols); + const auto n_rows = r_length(array); + auto out = Rcpp::Matrix(n_rows, static_cast(n_cols)); auto j = R_xlen_t(0); -#if RCPPSIMDJSON_EXCEPTIONS - for (simdjson::dom::array sub_array : array) { - auto i = R_xlen_t(0); - for (auto element : sub_array) { - out[i + j] = get_scalar(element); - i += n_rows; - } - j++; - } -#else for (auto sub_array : array) { auto i = R_xlen_t(0); for (auto element : sub_array.get().first) { @@ -72,29 +62,18 @@ inline auto build_matrix_typed(simdjson::dom::array array, const std::size_t n_c } j++; } -#endif return out; } template -inline auto build_matrix_integer64_typed(simdjson::dom::array array, const std::size_t n_cols) +inline auto build_matrix_integer64_typed(simdjson::dom::array array, std::size_t n_cols) -> Rcpp::Vector { const auto n_rows = std::size(array); auto stl_vec_int64 = std::vector(n_rows * n_cols); auto j = std::size_t(0); -#if RCPPSIMDJSON_EXCEPTIONS - for (simdjson::dom::array sub_array : array) { - auto i = std::size_t(0); - for (auto element : sub_array) { - stl_vec_int64[i + j] = get_scalar(element); - i += n_rows; - } - j++; - } -#else for (auto sub_array : array) { auto i = std::size_t(0); for (auto element : sub_array.get().first) { @@ -103,10 +82,10 @@ inline auto build_matrix_integer64_typed(simdjson::dom::array array, const std:: } j++; } -#endif auto out = Rcpp::NumericVector(utils::as_integer64(stl_vec_int64)); - out.attr("dim") = Rcpp::IntegerVector::create(n_rows, n_cols); + out.attr("dim") = Rcpp::IntegerVector::create(static_cast(n_rows), // + static_cast(n_cols)); return out; } @@ -159,7 +138,7 @@ inline auto dispatch_typed(const simdjson::dom::array array, : build_matrix_typed(array, n_cols); case simdjson::dom::element_type::NULL_VALUE: - return Rcpp::LogicalVector(std::size(array), NA_LOGICAL); + return Rcpp::LogicalVector(r_length(array), NA_LOGICAL); case simdjson::dom::element_type::UINT64: return has_nulls ? build_matrix_typed(array, n_cols) @@ -168,26 +147,16 @@ inline auto dispatch_typed(const simdjson::dom::array array, default: return R_NilValue; } -} // namespace deserialize +} template -inline auto build_matrix_mixed(const simdjson::dom::array array, const std::size_t n_cols) -> SEXP { +inline auto build_matrix_mixed(simdjson::dom::array array, std::size_t n_cols) -> SEXP { - const auto n_rows = std::size(array); - Rcpp::Matrix out(n_rows, n_cols); + const auto n_rows = r_length(array); + Rcpp::Matrix out(n_rows, static_cast(n_cols)); auto j = R_xlen_t(0); -#if RCPPSIMDJSON_EXCEPTIONS - for (simdjson::dom::array sub_array : array) { - auto i = R_xlen_t(0); - for (auto element : sub_array) { - out[i + j] = get_scalar_dispatch(element); - i += n_rows; - } - j++; - } -#else for (auto sub_array : array) { auto i = R_xlen_t(0); for (auto element : sub_array.get().first) { @@ -196,7 +165,6 @@ inline auto build_matrix_mixed(const simdjson::dom::array array, const std::size } j++; } -#endif return out; } @@ -207,29 +175,8 @@ inline auto build_matrix_integer64_mixed(const simdjson::dom::array array, std:: const auto n_rows = std::size(array); auto stl_vec_int64 = std::vector(n_rows * n_cols); - auto j = std::size_t(0); + auto j = std::size_t(0ULL); -#if RCPPSIMDJSON_EXCEPTIONS - for (simdjson::dom::array sub_array : array) { - std::size_t i = 0; - for (auto element : sub_array) { - switch (element.type()) { - case simdjson::dom::element_type::INT64: - stl_vec_int64[i + j] = get_scalar(element); - break; - - case simdjson::dom::element_type::BOOL: - stl_vec_int64[i + j] = get_scalar(element); - break; - - default: - stl_vec_int64[i + j] = NA_INTEGER64; - } - i += n_rows; - } - j++; - } -#else for (auto element : array) { std::size_t i = 0; for (auto sub_element : element.get().first) { @@ -249,11 +196,10 @@ inline auto build_matrix_integer64_mixed(const simdjson::dom::array array, std:: } j++; } -#endif - auto out = Rcpp::Vector(utils::as_integer64(stl_vec_int64)); - out.attr("dim") = Rcpp::IntegerVector::create(n_rows, n_cols); + out.attr("dim") = Rcpp::IntegerVector::create(static_cast(n_rows), // + static_cast(n_cols)); return out; } @@ -294,7 +240,7 @@ inline auto dispatch_mixed(const simdjson::dom::array array, return build_matrix_mixed(array, n_cols); default: { - auto out = Rcpp::LogicalMatrix(std::size(array), n_cols); + auto out = Rcpp::LogicalMatrix(r_length(array), static_cast(n_cols)); out.fill(NA_LOGICAL); return out; } diff --git a/inst/include/RcppSimdJson/deserialize/scalar.hpp b/inst/include/RcppSimdJson/deserialize/scalar.hpp index 258478b..d76c04e 100644 --- a/inst/include/RcppSimdJson/deserialize/scalar.hpp +++ b/inst/include/RcppSimdJson/deserialize/scalar.hpp @@ -7,6 +7,16 @@ namespace rcppsimdjson { namespace deserialize { +/* + * Check for `null`s and return the appropriate `NA`s when found. + */ +static inline constexpr bool HAS_NULLS = true; +/* + * No `null`s present, so skip checking for them. + */ +static inline constexpr bool NO_NULLS = false; + + template inline auto get_scalar_(simdjson::dom::element) noexcept(is_no_except(R_Type)); @@ -63,7 +73,7 @@ inline auto get_scalar_(simdjson::dom::element element) no template <> inline auto get_scalar_(simdjson::dom::element element) noexcept( is_no_except(rcpp_T::dbl)) { - return static_cast(element.get().first); + return element.get().first; } // return int64_t template <> diff --git a/inst/include/RcppSimdJson/deserialize/simplify.hpp b/inst/include/RcppSimdJson/deserialize/simplify.hpp index 096ef61..3107c64 100644 --- a/inst/include/RcppSimdJson/deserialize/simplify.hpp +++ b/inst/include/RcppSimdJson/deserialize/simplify.hpp @@ -2,118 +2,22 @@ #define RCPPSIMDJSON__DESERIALIZE__SIMPLIFY_HPP -namespace rcppsimdjson { -namespace deserialize { - - -static inline constexpr bool HAS_NULLS = true; -static inline constexpr bool NO_NULLS = false; - -} // namespace deserialize -} // namespace rcppsimdjson - - +#include "../common.hpp" #include "Type_Doctor.hpp" #include "scalar.hpp" #include "vector.hpp" #include "matrix.hpp" #include "dataframe.hpp" + namespace rcppsimdjson { namespace deserialize { -enum class Simplify_To : int { - data_frame = 0, - matrix = 1, - vector = 2, - list = 3, -}; - - -// forward declaration template -inline auto simplify_element(const simdjson::dom::element, const SEXP, const SEXP) -> SEXP; - - -template -inline auto build_data_frame(const simdjson::dom::array array, - const std::map>& cols, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { - - const auto n_rows = R_xlen_t(std::size(array)); - auto out = Rcpp::List(std::size(cols)); - auto out_names = Rcpp::CharacterVector(std::size(cols)); - - for (auto [key, col] : cols) { - out_names[col.index] = std::string(key); - - switch (col.schema.common_R_type()) { - case rcpp_T::chr: { - out[col.index] = - build_col(array, key, col.schema); - break; - } - - case rcpp_T::dbl: { - out[col.index] = - build_col(array, key, col.schema); - break; - } - - case rcpp_T::i64: { - out[col.index] = build_col_integer64(array, key, col.schema); - break; - } - - case rcpp_T::i32: { - out[col.index] = - build_col(array, key, col.schema); - break; - } - - case rcpp_T::lgl: { - out[col.index] = build_col(array, key, col.schema); - break; - } - - case rcpp_T::null: { - out[col.index] = Rcpp::LogicalVector(n_rows, NA_LOGICAL); - break; - } - - default: { - auto this_col = Rcpp::Vector(n_rows); - auto i_row = R_xlen_t(0); - for (auto element : array) { - auto [value, error] = element.get().at_key(key); - if (error) { - this_col[i_row++] = NA_LOGICAL; - } else { - this_col[i_row++] = simplify_element( - value, empty_array, empty_object // - ); - } - } - out[col.index] = this_col; - } - } - } - - out.attr("names") = out_names; - out.attr("row.names") = Rcpp::seq(1, n_rows); - out.attr("class") = "data.frame"; - - return out; -} - - -template -inline auto simplify_list(const simdjson::dom::array array, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { - Rcpp::List out(std::size(array)); +inline auto simplify_list(const simdjson::dom::array array, SEXP empty_array, SEXP empty_object) + -> SEXP { + Rcpp::List out(r_length(array)); auto i = R_xlen_t(0); for (auto element : array) { @@ -129,9 +33,8 @@ inline auto simplify_list(const simdjson::dom::array array, template -inline auto simplify_vector(const simdjson::dom::array array, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { +inline auto simplify_vector(const simdjson::dom::array array, SEXP empty_array, SEXP empty_object) + -> SEXP { const auto type_doctor = Type_Doctor(array); if (type_doctor.is_vectorizable()) { @@ -149,9 +52,8 @@ inline auto simplify_vector(const simdjson::dom::array array, template -inline auto simplify_matrix(const simdjson::dom::array array, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { +inline auto simplify_matrix(const simdjson::dom::array array, SEXP empty_array, SEXP empty_object) + -> SEXP { if (const auto matrix = matrix::diagnose(array)) { return matrix->is_homogeneous ? matrix::dispatch_typed( // @@ -169,9 +71,8 @@ inline auto simplify_matrix(const simdjson::dom::array array, template -inline auto simplify_data_frame(const simdjson::dom::array array, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { +inline auto +simplify_data_frame(const simdjson::dom::array array, SEXP empty_array, SEXP empty_object) -> SEXP { if (const auto cols = diagnose_data_frame(array)) { return build_data_frame( // array, // @@ -187,8 +88,8 @@ inline auto simplify_data_frame(const simdjson::dom::array array, template inline auto dispatch_simplify_array(const simdjson::dom::array array, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { + SEXP empty_array, + SEXP empty_object) -> SEXP { if (std::size(array) == 0) { return empty_array; @@ -229,10 +130,9 @@ inline auto dispatch_simplify_array(const simdjson::dom::array array, template -inline auto simplify_object(const simdjson::dom::object object, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { - const auto n = R_xlen_t(std::size(object)); +inline auto simplify_object(const simdjson::dom::object object, SEXP empty_array, SEXP empty_object) + -> SEXP { + const auto n = r_length(object); if (n == 0) { return empty_object; } @@ -240,7 +140,7 @@ inline auto simplify_object(const simdjson::dom::object object, Rcpp::List out(n); Rcpp::CharacterVector out_names(n); - auto i = R_xlen_t(0); + auto i = R_xlen_t(0L); for (auto [key, value] : object) { out[i] = simplify_element(value, empty_array, empty_object); @@ -252,10 +152,34 @@ inline auto simplify_object(const simdjson::dom::object object, } +/** + * @brief Simplify a @c simdjson::dom::element to an R object. + * + * + * @tparam type_policy The @c Type_Policy specifying type strictness in combining mixed-type array + * elements into R vectors. + * + * @tparam int64_opt The @c Int64_R_Type specifying how big integers are returned to R. + * + * @tparam simplify_to The @c Simplify_To specifying the maximum level of simplification. + * + * + * @param element @c simdjson::dom::element to simplify. + * + * @param empty_array R object to return when encountering an empty JSON array. + * + * @param empty_object R object to return when encountering an empty JSON object. + * + * + * @return The simplified R object ( @c SEXP ). + * + * + * @note definition: forward declaration in @file inst/include/RcppSimdJson/common.hpp @file. + */ template inline auto simplify_element(const simdjson::dom::element element, - const SEXP empty_array, - const SEXP empty_object) -> SEXP { + SEXP empty_array, + SEXP empty_object) -> SEXP { switch (element.type()) { case simdjson::dom::element_type::ARRAY: @@ -273,13 +197,13 @@ inline auto simplify_element(const simdjson::dom::element element, ); case simdjson::dom::element_type::DOUBLE: - return Rcpp::wrap(element.get().first); + return Rcpp::wrap(element.get().first); case simdjson::dom::element_type::INT64: return utils::resolve_int64(element.get().first); case simdjson::dom::element_type::BOOL: - return Rcpp::wrap(element.get().first); + return Rcpp::wrap(element.get().first); case simdjson::dom::element_type::STRING: return Rcpp::wrap(element.get().first); diff --git a/inst/include/RcppSimdJson/deserialize/vector.hpp b/inst/include/RcppSimdJson/deserialize/vector.hpp index e747af9..7771044 100644 --- a/inst/include/RcppSimdJson/deserialize/vector.hpp +++ b/inst/include/RcppSimdJson/deserialize/vector.hpp @@ -9,7 +9,7 @@ namespace vector { template inline auto build_vector_typed(const simdjson::dom::array array) -> Rcpp::Vector { - auto out = Rcpp::Vector(std::size(array)); + auto out = Rcpp::Vector(r_length(array)); auto i = R_xlen_t(0); for (auto element : array) { out[i++] = get_scalar(element); @@ -23,7 +23,7 @@ inline auto build_vector_integer64_typed(const simdjson::dom::array array) -> Rcpp::Vector { auto stl_vec_int64 = std::vector(std::size(array)); - auto i = std::size_t(0); + auto i = std::size_t(0ULL); for (auto element : array) { stl_vec_int64[i++] = get_scalar(element); } @@ -82,8 +82,8 @@ inline auto dispatch_typed(const simdjson::dom::array array, template inline auto build_vector_mixed(const simdjson::dom::array array) -> Rcpp::Vector { - auto out = Rcpp::Vector(std::size(array)); - auto i = R_xlen_t(0); + auto out = Rcpp::Vector(r_length(array)); + auto i = R_xlen_t(0L); for (auto element : array) { out[i++] = get_scalar_dispatch(element); } @@ -96,7 +96,7 @@ inline auto build_vector_integer64_mixed(const simdjson::dom::array array) -> Rcpp::Vector { auto stl_vec_int64 = std::vector(std::size(array)); - auto i = std::size_t(0); + auto i = std::size_t(0ULL); for (auto element : array) { switch (element.type()) { case simdjson::dom::element_type::INT64: @@ -151,7 +151,7 @@ inline auto dispatch_mixed(const simdjson::dom::array array, const rcpp_T common return build_vector_mixed(array); default: - return Rcpp::LogicalVector(std::size(array), NA_LOGICAL); + return Rcpp::LogicalVector(r_length(array), NA_LOGICAL); } } diff --git a/inst/tinytest/test_load_json.R b/inst/tinytest/test_load_json.R new file mode 100644 index 0000000..f0262f2 --- /dev/null +++ b/inst/tinytest/test_load_json.R @@ -0,0 +1,14 @@ +if (RcppSimdJson:::.unsupportedArchitecture()) exit_file("Unsupported chipset") + +all_files <- dir("../jsonexamples", pattern = "\\.json$", + recursive = TRUE, full.names = TRUE) + +sapply(all_files, function(.x) expect_silent(RcppSimdJson:::.load_json(.x))) + +expect_error( + RcppSimdJson:::.load_json("../jsonexamples/amazon_cellphones.ndjson") +) + +expect_error( + RcppSimdJson:::.load_json("not/a/real/file.json") +) diff --git a/inst/tinytest/test_vectorized_ops.R b/inst/tinytest/test_vectorized_ops.R new file mode 100644 index 0000000..1199a35 --- /dev/null +++ b/inst/tinytest/test_vectorized_ops.R @@ -0,0 +1,82 @@ +if (RcppSimdJson:::.unsupportedArchitecture()) exit_file("Unsupported chipset") + +# .deserialize_json ============================================================ +test <- c( + first = '{"A":[[1,2,3],[4,5,6]]}', + second = '{"B":[{"a":1,"b":true},{"a":2,"b":false,"c":null}]}' +) + +target <- list( + first = list( + A = matrix( + c( + 1L, 2L, 3L, + 4L, 5L, 6L + ), + nrow = 2L, ncol = 3L, byrow = TRUE + ) + ), + second = list( + B = data.frame( + a = c(1L, 2L), + b = c(TRUE, FALSE), + c = c(NA, NA) + ) + ) +) + +expect_identical( + RcppSimdJson:::.deserialize_json(test), + target +) + +# confirm errors work ---------------------------------------------------------- +test <- c( + first = '{"A":[[1,2,3],[4,5,6]]}', + bad_json = '{"B":[{"a":1,"b":JUNK},{"a":2,"b":false,"c":null}]}' +) +expect_error( + RcppSimdJson:::.deserialize_json(test) +) +# .load_json() ================================================================= +test <- c( + flatadversarial.json = "../jsonexamples/small/flatadversarial.json", + adversarial.json = "../jsonexamples/small/adversarial.json" +) + +if (!all(file.exists(test))) { + exit_file( + "flatadversarial.json and/or adversarial.json are missing." + ) +} + +target <- list( + flatadversarial.json = list( + `"Name` = c("116", "\\\"", "234", "true", "FALSE"), t = 1e+10 + ), + adversarial.json = list( + `"Name rue` = structure( + c("116", "\"", "234", "true", "FALSE"), + .Dim = c(1L, 5L) + ) + ) +) + +expect_identical( + RcppSimdJson:::.load_json(test), + target +) + +# all files battery ------------------------------------------------------------ +all_files <- dir("inst/jsonexamples", pattern = "\\.json$", + recursive = TRUE, full.names = TRUE) +expect_silent( + RcppSimdJson:::.load_json(all_files) +) +# confirm errors work ---------------------------------------------------------- +expect_error( + RcppSimdJson:::.load_json(c("a/fake/file.json", all_files)) +) +expect_error( + RcppSimdJson:::.load_json(c(all_files, "another/fake/file.json")) +) diff --git a/man/dot-deserialize_json.Rd b/man/dot-deserialize_json.Rd index 72cfeed..f7ef4ac 100644 --- a/man/dot-deserialize_json.Rd +++ b/man/dot-deserialize_json.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/RcppExports.R \name{.deserialize_json} \alias{.deserialize_json} +\alias{.load_json} \title{Deserialize JSON into R Objects} \usage{ .deserialize_json( @@ -13,9 +14,19 @@ type_policy = 0L, int64_r_type = 0L ) + +.load_json( + file_path, + json_pointer = "", + empty_array = NULL, + empty_object = NULL, + simplify_to = 0L, + type_policy = 0L, + int64_r_type = 0L +) } \arguments{ -\item{json}{\code{character(1L)}} +\item{json}{\code{character()} containing one or more strings of JSON data.} \item{json_pointer}{\code{character(1L)}, default: \code{""}} @@ -23,16 +34,62 @@ \item{empty_object}{default: \code{NULL}. Any R object to return for empty JSON objects.} -\item{simplify_to}{default: \code{0}. Maximum simplification level. -0=dataframe, 1=matrix, 2=vector, 3=list} +\item{simplify_to}{\code{integer(1L)}, default: \code{0L}. +Maximum simplification level. +0: data frame, 1: matrix, 2: vector, 3: list (no simplification)} -\item{type_policy}{default: \code{0}. Level of type strictness. -0=anything goes, 1=merge integers/doubles, 2=strict} +\item{type_policy}{\code{integer(1L)}, default: \code{0L}. +Level of type strictness. +0: merge everything, 1: merge numbers, 2: strict (mixed types are not merged)} -\item{int64_r_type}{default: \code{0} How to return big integers to R. -0=double, 1=string, 2=bit64::integer64} +\item{int64_r_type}{\code{integer(1L)} default: \code{0L} +How to return big integers to R. +0: \code{double}, 1: string, 2: \code{bit64::integer64}-compatible number} + +\item{file_path}{\code{character()} containing one or more paths to files containing +JSON data.} } \description{ Deserialize JSON into R Objects +} +\details{ +Instead of using \code{lapply()} for vectors containing multiple strings/file paths, + just use \code{.deserialize_json()} and \code{.load_json()} directly as they are vectorized + (in the R sense). This is much more efficient as the underlying \code{simdjson::parser} can + reuse internal buffers between parses. Since the overwhelming majority of JSON objects + parsed will not result in R scalars, a \code{list()} is always returned when multiple items + are passed to \code{.deserialize_json()} or \code{.load_json()}. Also in keeping with + \code{lapply()}'s behavior, if the data passed has \code{names()}, the returned object will + have the same names. +} +\examples{ +# .deserialize_json() ====================================================== +RcppSimdJson:::.deserialize_json('[[1,2,3],[4,5,6]]') + +RcppSimdJson:::.deserialize_json( + '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]' +) + +RcppSimdJson:::.deserialize_json( + c( + json1 = "[[1,2,3],[4,5,6]]", + json2 = '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]' + ) +) + +# .load_json() ============================================================= +single_file <- system.file("jsonexamples", "small", "flatadversarial.json", + package = "RcppSimdJson") +RcppSimdJson:::.load_json(single_file) + +multiple_files <- vapply( + c("flatadversarial.json", "adversarial.json"), + function(.x) { + system.file("jsonexamples/small", .x, package = "RcppSimdJson") + }, + character(1L) +) +RcppSimdJson:::.load_json(multiple_files) + } \keyword{internal} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index df640d0..770e97e 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -6,12 +6,12 @@ using namespace Rcpp; // deserialize_json -SEXP deserialize_json(const Rcpp::String& json, const std::string& json_pointer, SEXP empty_array, SEXP empty_object, const int simplify_to, const int type_policy, const int int64_r_type); +SEXP deserialize_json(const Rcpp::CharacterVector& json, const std::string& json_pointer, SEXP empty_array, SEXP empty_object, const int simplify_to, const int type_policy, const int int64_r_type); RcppExport SEXP _RcppSimdJson_deserialize_json(SEXP jsonSEXP, SEXP json_pointerSEXP, SEXP empty_arraySEXP, SEXP empty_objectSEXP, SEXP simplify_toSEXP, SEXP type_policySEXP, SEXP int64_r_typeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const Rcpp::String& >::type json(jsonSEXP); + Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type json(jsonSEXP); Rcpp::traits::input_parameter< const std::string& >::type json_pointer(json_pointerSEXP); Rcpp::traits::input_parameter< SEXP >::type empty_array(empty_arraySEXP); Rcpp::traits::input_parameter< SEXP >::type empty_object(empty_objectSEXP); @@ -22,6 +22,23 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// load_json +SEXP load_json(const Rcpp::CharacterVector& file_path, const std::string& json_pointer, SEXP empty_array, SEXP empty_object, const int simplify_to, const int type_policy, const int int64_r_type); +RcppExport SEXP _RcppSimdJson_load_json(SEXP file_pathSEXP, SEXP json_pointerSEXP, SEXP empty_arraySEXP, SEXP empty_objectSEXP, SEXP simplify_toSEXP, SEXP type_policySEXP, SEXP int64_r_typeSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type file_path(file_pathSEXP); + Rcpp::traits::input_parameter< const std::string& >::type json_pointer(json_pointerSEXP); + Rcpp::traits::input_parameter< SEXP >::type empty_array(empty_arraySEXP); + Rcpp::traits::input_parameter< SEXP >::type empty_object(empty_objectSEXP); + Rcpp::traits::input_parameter< const int >::type simplify_to(simplify_toSEXP); + Rcpp::traits::input_parameter< const int >::type type_policy(type_policySEXP); + Rcpp::traits::input_parameter< const int >::type int64_r_type(int64_r_typeSEXP); + rcpp_result_gen = Rcpp::wrap(load_json(file_path, json_pointer, empty_array, empty_object, simplify_to, type_policy, int64_r_type)); + return rcpp_result_gen; +END_RCPP +} // exceptions_enabled bool exceptions_enabled(); RcppExport SEXP _RcppSimdJson_exceptions_enabled() { @@ -85,6 +102,7 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_RcppSimdJson_deserialize_json", (DL_FUNC) &_RcppSimdJson_deserialize_json, 7}, + {"_RcppSimdJson_load_json", (DL_FUNC) &_RcppSimdJson_load_json, 7}, {"_RcppSimdJson_exceptions_enabled", (DL_FUNC) &_RcppSimdJson_exceptions_enabled, 0}, {"_RcppSimdJson_check_int64", (DL_FUNC) &_RcppSimdJson_check_int64, 0}, {"_RcppSimdJson_validateJSON", (DL_FUNC) &_RcppSimdJson_validateJSON, 1}, diff --git a/src/deserialize.cpp b/src/deserialize.cpp index 6d81721..f37f693 100644 --- a/src/deserialize.cpp +++ b/src/deserialize.cpp @@ -1,8 +1,74 @@ #include + +SEXP deserialize_single_string(const Rcpp::CharacterVector& json, + const std::string& json_pointer, + SEXP empty_array, + SEXP empty_object, + const rcppsimdjson::deserialize::Simplify_To simplify_to, + const rcppsimdjson::deserialize::Type_Policy type_policy, + const rcppsimdjson::utils::Int64_R_Type int64_r_type) { + using namespace rcppsimdjson; + + simdjson::dom::parser parser; + auto [parsed, error] = json_pointer.empty() // + ? parser.parse(std::string(json[0])) + : parser.parse(std::string(json[0])).at(json_pointer); + if (error) { + Rcpp::stop(simdjson::error_message(error)); + } + + return deserialize::deserialize( // + parsed, // + empty_array, // + empty_object, // + simplify_to, // + type_policy, // + int64_r_type // + ); // +} + + +SEXP deserialize_multiple_strings(const Rcpp::CharacterVector& json, + const std::string& json_pointer, + SEXP empty_array, + SEXP empty_object, + const rcppsimdjson::deserialize::Simplify_To simplify_to, + const rcppsimdjson::deserialize::Type_Policy type_policy, + const rcppsimdjson::utils::Int64_R_Type int64_r_type) { + using namespace rcppsimdjson; + + const auto n = json.length(); + Rcpp::List out(n); + simdjson::dom::parser parser; + + for (R_xlen_t i = 0; i < n; ++i) { + auto [parsed, error] = json_pointer.empty() // + ? parser.parse(std::string(json[i])) + : parser.parse(std::string(json[i])).at(json_pointer); + if (error) { + Rcpp::stop("%s\n\nValue affected:\n\t- `json[[%d]]`", simdjson::error_message(error), i + 1); + } + + out[i] = deserialize::deserialize( // + parsed, // + empty_array, // + empty_object, // + simplify_to, // + type_policy, // + int64_r_type // + ); // + } + + out.attr("names") = json.attr("names"); + + return out; +} + + //' Deserialize JSON into R Objects //' -//' @param json \code{character(1L)} +//' @param json \code{character()} containing one or more strings of JSON data. //' //' @param json_pointer \code{character(1L)}, default: \code{""} //' @@ -10,48 +76,202 @@ //' //' @param empty_object default: \code{NULL}. Any R object to return for empty JSON objects. //' -//' @param simplify_to default: \code{0}. Maximum simplification level. -//' 0=dataframe, 1=matrix, 2=vector, 3=list +//' @param simplify_to \code{integer(1L)}, default: \code{0L}. +//' Maximum simplification level. +//' 0: data frame, 1: matrix, 2: vector, 3: list (no simplification) //' -//' @param type_policy default: \code{0}. Level of type strictness. -//' 0=anything goes, 1=merge integers/doubles, 2=strict +//' @param type_policy \code{integer(1L)}, default: \code{0L}. +//' Level of type strictness. +//' 0: merge everything, 1: merge numbers, 2: strict (mixed types are not merged) //' -//' @param int64_r_type default: \code{0} How to return big integers to R. -//' 0=double, 1=string, 2=bit64::integer64 +//' @param int64_r_type \code{integer(1L)} default: \code{0L} +//' How to return big integers to R. +//' 0: \code{double}, 1: string, 2: \code{bit64::integer64}-compatible number +//' +//' @details +//' Instead of using \code{lapply()} for vectors containing multiple strings/file paths, +//' just use \code{.deserialize_json()} and \code{.load_json()} directly as they are vectorized +//' (in the R sense). This is much more efficient as the underlying \code{simdjson::parser} can +//' reuse internal buffers between parses. Since the overwhelming majority of JSON objects +//' parsed will not result in R scalars, a \code{list()} is always returned when multiple items +//' are passed to \code{.deserialize_json()} or \code{.load_json()}. Also in keeping with +//' \code{lapply()}'s behavior, if the data passed has \code{names()}, the returned object will +//' have the same names. //' //' @keywords internal //' +//' @examples +//' # .deserialize_json() ====================================================== +//' RcppSimdJson:::.deserialize_json('[[1,2,3],[4,5,6]]') +//' +//' RcppSimdJson:::.deserialize_json( +//' '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]' +//' ) +//' +//' RcppSimdJson:::.deserialize_json( +//' c( +//' json1 = "[[1,2,3],[4,5,6]]", +//' json2 = '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]' +//' ) +//' ) +//' // [[Rcpp::export(.deserialize_json)]] -SEXP deserialize_json(const Rcpp::String& json, +SEXP deserialize_json(const Rcpp::CharacterVector& json, const std::string& json_pointer = "", SEXP empty_array = R_NilValue, SEXP empty_object = R_NilValue, const int simplify_to = 0, const int type_policy = 0, const int int64_r_type = 0) { - using namespace rcppsimdjson; + switch (json.length()) { + case 0: + return R_NilValue; - simdjson::dom::parser parser; + case 1: + return deserialize_single_string( // + json, // + json_pointer, // + empty_array, // + empty_object, // + static_cast(simplify_to), // + static_cast(type_policy), // + static_cast(int64_r_type) // + ); // + + default: + return deserialize_multiple_strings( // + json, // + json_pointer, // + empty_array, // + empty_object, // + static_cast(simplify_to), // + static_cast(type_policy), // + static_cast(int64_r_type) // + ); // + } -#if RCPPSIMDJSON_EXCEPTIONS - simdjson::dom::element parsed = json_pointer.empty() // - ? parser.parse(json) - : parser.parse(json).at(json_pointer); -#else + return R_NilValue; +} + + +SEXP load_single_file(const Rcpp::String& file_path, + const std::string& json_pointer, + SEXP empty_array, + SEXP empty_object, + const rcppsimdjson::deserialize::Simplify_To simplify_to, + const rcppsimdjson::deserialize::Type_Policy type_policy, + const rcppsimdjson::utils::Int64_R_Type int64_r_type) { + simdjson::dom::parser parser; auto [parsed, error] = json_pointer.empty() // - ? parser.parse(json).first - : parser.parse(json).at(json_pointer); + ? parser.load(std::string(file_path)) + : parser.load(std::string(file_path)).at(json_pointer); if (error) { Rcpp::stop(simdjson::error_message(error)); } -#endif - return deserialize::deserialize(parsed, - empty_array, - empty_object, - static_cast(simplify_to), - static_cast(type_policy), - static_cast(int64_r_type)); + return rcppsimdjson::deserialize::deserialize( // + parsed, // + empty_array, // + empty_object, // + simplify_to, // + type_policy, // + int64_r_type // + ); // +} + + +SEXP load_multiple_files(const Rcpp::CharacterVector& file_path, + const std::string& json_pointer, + SEXP empty_array, + SEXP empty_object, + const rcppsimdjson::deserialize::Simplify_To simplify_to, + const rcppsimdjson::deserialize::Type_Policy type_policy, + const rcppsimdjson::utils::Int64_R_Type int64_r_type) { + const auto n = file_path.length(); + auto out = Rcpp::List(n); + simdjson::dom::parser parser; + + for (R_xlen_t i = 0; i < n; ++i) { + auto [parsed, error] = json_pointer.empty() // + ? parser.load(std::string(file_path[i])) + : parser.load(std::string(file_path[i])).at(json_pointer); + if (error) { + Rcpp::stop("%s\n\nFile affected:\n\t- %s", // + simdjson::error_message(error), // + std::string(file_path[i])); // + } + + out[i] = rcppsimdjson::deserialize::deserialize( // + parsed, // + empty_array, // + empty_object, // + simplify_to, // + type_policy, // + int64_r_type // + ); // + } + + out.attr("names") = file_path.attr("names"); + return out; +} + + +//' @rdname dot-deserialize_json +//' +//' @param file_path \code{character()} containing one or more paths to files containing +//' JSON data. +//' +//' @examples +//' # .load_json() ============================================================= +//' single_file <- system.file("jsonexamples", "small", "flatadversarial.json", +//' package = "RcppSimdJson") +//' RcppSimdJson:::.load_json(single_file) +//' +//' multiple_files <- vapply( +//' c("flatadversarial.json", "adversarial.json"), +//' function(.x) { +//' system.file("jsonexamples/small", .x, package = "RcppSimdJson") +//' }, +//' character(1L) +//' ) +//' RcppSimdJson:::.load_json(multiple_files) +//' +// [[Rcpp::export(.load_json)]] +SEXP load_json(const Rcpp::CharacterVector& file_path, + const std::string& json_pointer = "", + SEXP empty_array = R_NilValue, + SEXP empty_object = R_NilValue, + const int simplify_to = 0, + const int type_policy = 0, + const int int64_r_type = 0) { + switch (file_path.length()) { + case 0: + return R_NilValue; + + case 1: + return load_single_file( // + file_path[0], // + json_pointer, // + empty_array, // + empty_object, // + static_cast(simplify_to), // + static_cast(type_policy), // + static_cast(int64_r_type) // + ); // + + default: + return load_multiple_files( // + file_path, // + json_pointer, // + empty_array, // + empty_object, // + static_cast(simplify_to), // + static_cast(type_policy), // + static_cast(int64_r_type) // + ); // + } + + return R_NilValue; }