Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/deserialize #20

Merged
merged 16 commits into from
Jun 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
66 changes: 59 additions & 7 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,81 @@

#' Deserialize JSON into R Objects
#'
#' @param json \code{character(1L)}
#' @param json \code{character()} containing one or more strings of JSON data.
#'
#' @param json_pointer \code{character(1L)}, default: \code{""}
#'
#' @param empty_array default: \code{NULL}. Any R object to return for empty JSON arrays.
#'
#' @param empty_object default: \code{NULL}. Any R object to return for empty JSON objects.
#'
#' @param simplify_to default: \code{0}. Maximum simplification level.
#' 0=dataframe, 1=matrix, 2=vector, 3=list
#' @param simplify_to \code{integer(1L)}, default: \code{0L}.
#' Maximum simplification level.
#' 0: data frame, 1: matrix, 2: vector, 3: list (no simplification)
#'
#' @param type_policy default: \code{0}. Level of type strictness.
#' 0=anything goes, 1=merge integers/doubles, 2=strict
#' @param type_policy \code{integer(1L)}, default: \code{0L}.
#' Level of type strictness.
#' 0: merge everything, 1: merge numbers, 2: strict (mixed types are not merged)
#'
#' @param int64_r_type default: \code{0} How to return big integers to R.
#' 0=double, 1=string, 2=bit64::integer64
#' @param int64_r_type \code{integer(1L)}, default: \code{0L}.
#' How to return big integers to R.
#' 0: \code{double}, 1: string, 2: \code{bit64::integer64}-compatible number
#'
#' @details
#' Instead of using \code{lapply()} for vectors containing multiple strings/file paths,
#' just use \code{.deserialize_json()} and \code{.load_json()} directly as they are vectorized
#' (in the R sense). This is much more efficient as the underlying \code{simdjson::parser} can
#' reuse internal buffers between parses. Since the overwhelming majority of JSON objects
#' parsed will not result in R scalars, a \code{list()} is always returned when multiple items
#' are passed to \code{.deserialize_json()} or \code{.load_json()}. Also in keeping with
#' \code{lapply()}'s behavior, if the data passed has \code{names()}, the returned object will
#' have the same names.
#'
#' @keywords internal
#'
#' @examples
#' # .deserialize_json() ======================================================
#' RcppSimdJson:::.deserialize_json('[[1,2,3],[4,5,6]]')
#'
#' RcppSimdJson:::.deserialize_json(
#' '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]'
#' )
#'
#' RcppSimdJson:::.deserialize_json(
#' c(
#' json1 = "[[1,2,3],[4,5,6]]",
#' json2 = '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]'
#' )
#' )
#'
.deserialize_json <- function(json, json_pointer = "", empty_array = NULL, empty_object = NULL, simplify_to = 0L, type_policy = 0L, int64_r_type = 0L) {
# Thin wrapper: every argument is forwarded, unchanged and in declaration order,
# to the native routine `_RcppSimdJson_deserialize_json` via .Call(); whatever
# that routine returns is the result. No validation is done at the R level.
.Call(`_RcppSimdJson_deserialize_json`, json, json_pointer, empty_array, empty_object, simplify_to, type_policy, int64_r_type)
}

#' @rdname dot-deserialize_json
#'
#' @param file_path \code{character()} containing one or more paths to files containing
#' JSON data.
#'
#' @examples
#' # .load_json() =============================================================
#' single_file <- system.file("jsonexamples", "small", "flatadversarial.json",
#' package = "RcppSimdJson")
#' RcppSimdJson:::.load_json(single_file)
#'
#' multiple_files <- vapply(
#' c("flatadversarial.json", "adversarial.json"),
#' function(.x) {
#' system.file("jsonexamples/small", .x, package = "RcppSimdJson")
#' },
#' character(1L)
#' )
#' RcppSimdJson:::.load_json(multiple_files)
#'
.load_json <- function(file_path, json_pointer = "", empty_array = NULL, empty_object = NULL, simplify_to = 0L, type_policy = 0L, int64_r_type = 0L) {
# Thin wrapper: every argument is forwarded, unchanged and in declaration order,
# to the native routine `_RcppSimdJson_load_json` via .Call(); whatever that
# routine returns is the result. No validation is done at the R level.
.Call(`_RcppSimdJson_load_json`, file_path, json_pointer, empty_array, empty_object, simplify_to, type_policy, int64_r_type)
}

.exceptions_enabled <- function() {
# Calls the native routine `_RcppSimdJson_exceptions_enabled` with no arguments
# and returns its result.
# NOTE(review): presumably reports whether simdjson was compiled with exceptions
# enabled -- confirm against the C++ definition.
.Call(`_RcppSimdJson_exceptions_enabled`)
}
Expand Down
62 changes: 0 additions & 62 deletions inst/include/RcppSimdJson.hpp
Original file line number Diff line number Diff line change
@@ -1,68 +1,6 @@
#ifndef RCPPSIMDJSON_HPP
#define RCPPSIMDJSON_HPP

#define STRICT_R_HEADERS
#include <Rcpp.h>


namespace rcppsimdjson {

/**
 * @brief 64-bit integer @c NA sentinel: the smallest @c long @c long value.
 */
static inline constexpr int64_t NA_INTEGER64{LLONG_MIN};


/**
 * @brief Type tags that decide how a parsed JSON element is ultimately returned to R.
 *
 * Scoped enum with @c int as underlying type; every enumerator carries an explicit value.
 */
enum class rcpp_T : int {
array = 0, /**< recursive: individual elements will decide ultimate R type */
object = 1, /**< recursive: individual elements will decide ultimate R type */
chr = 2, /**< R @c character */
u64 = 3, /**< unsigned 64-bit integer */
dbl = 4, /**< R @c double */
i64 = 5, /**< signed 64-bit integer; its @c NA is @c NA_INTEGER64 */
i32 = 6, /**< R @c integer */
lgl = 7, /**< R @c logical */
null = 8, /**< JSON @c null */
};


/**
 * @brief Typed @c NA lookup, selected at compile time by @c R_Type.
 *
 * Only the branch matching @c R_Type survives instantiation, so the deduced return type is
 * the type of the selected @c NA constant. For an @c rcpp_T without a branch below nothing is
 * returned and the deduced return type is @c void.
 *
 * @tparam R_Type The @c rcpp_T whose @c NA is requested.
 */
template <rcpp_T R_Type>
static inline constexpr auto na_val() {
    if constexpr (R_Type == rcpp_T::chr) {
        return NA_STRING;
    } else if constexpr (R_Type == rcpp_T::dbl) {
        return NA_REAL;
    } else if constexpr (R_Type == rcpp_T::i64) {
        return NA_INTEGER64;
    } else if constexpr (R_Type == rcpp_T::i32) {
        return NA_INTEGER;
    } else if constexpr (R_Type == rcpp_T::lgl) {
        return NA_LOGICAL;
    }
}


/*
 * Exception configuration.
 *
 * RCPPSIMDJSON_EXCEPTIONS mirrors SIMDJSON_EXCEPTIONS when the latter is defined and is 1
 * otherwise. RCPPSIMDJSON_NO_EXCEPTIONS is true only when SIMDJSON_EXCEPTIONS is explicitly 0.
 */
// #define SIMDJSON_EXCEPTIONS 0
#ifdef SIMDJSON_EXCEPTIONS
#define RCPPSIMDJSON_EXCEPTIONS SIMDJSON_EXCEPTIONS
// simdjson treats the macro as a boolean (any non-zero value keeps exceptions on), so only an
// explicit 0 may switch on the noexcept annotations that depend on this flag.
static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = SIMDJSON_EXCEPTIONS == 0;
#else
#define RCPPSIMDJSON_EXCEPTIONS 1
static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = false;
#endif


/**
 * @brief Whether code handling @c R_Type may be declared @c noexcept.
 *
 * @param R_Type The @c rcpp_T being handled.
 * @return @c true only when @c RCPPSIMDJSON_NO_EXCEPTIONS is set and @c R_Type is not
 *         @c rcpp_T::chr.
 */
static inline constexpr auto is_no_except(rcpp_T R_Type) -> bool {
    // strings are the one scalar type whose extraction seems to touch throwing code
    if (R_Type == rcpp_T::chr) {
        return false;
    }
    return RCPPSIMDJSON_NO_EXCEPTIONS;
}


} // namespace rcppsimdjson

#include <simdjson.h>

#include "RcppSimdJson/utils.hpp"
#include "RcppSimdJson/deserialize.hpp"

#endif
151 changes: 151 additions & 0 deletions inst/include/RcppSimdJson/common.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#ifndef RCPPSIMDJSON_COMMON_HPP
#define RCPPSIMDJSON_COMMON_HPP


#define STRICT_R_HEADERS
#include <Rcpp.h>


namespace rcppsimdjson {

/**
 * @brief A container's size as an @c R_xlen_t. Otherwise equivalent to @c std::size().
 *
 * @tparam Container Any type accepted by @c std::size().
 *
 * @param cont The container whose element count is wanted.
 *
 * @return The result of @c std::size(cont), converted with @c static_cast to @c R_xlen_t.
 *
 * @note The template and function parameter names deliberately avoid the leading-underscore /
 *       double-underscore spellings, which are reserved for the implementation.
 */
template <typename Container>
inline constexpr auto r_length(const Container& cont) noexcept -> R_xlen_t {
    return static_cast<R_xlen_t>(std::size(cont));
}


/**
 * @brief The @c NA value for 64-bit integers, compatible with @c bit64::integer64.
 *
 * Defined as the smallest @c long @c long value.
 */
static inline constexpr int64_t NA_INTEGER64{LLONG_MIN};


/**
 * @brief Typing arguments that decide how a @c simdjson::dom::element is ultimately returned to R.
 *
 * Scoped enum with @c int as underlying type; every enumerator carries an explicit value.
 */
enum class rcpp_T : int {
array = 0, /**< recursive: individual elements will decide ultimate R type */
object = 1, /**< recursive: individual elements will decide ultimate R type */
chr = 2, /**< always becomes @c Rcpp::String / R @c character */
u64 = 3, /**< always becomes @c Rcpp::String / R @c character */
dbl = 4, /**< always becomes @c double */
i64 = 5, /**< follows @c Int64_R_Type: @c double, @c character, or @c bit64::integer64 */
i32 = 6, /**< always becomes @c int */
lgl = 7, /**< always becomes @c bool / R @c logical */
null = 8, /**< becomes @c NA if returned in a vector, else @c NULL */
};


/**
 * @brief Get a typed @c NA, selected at compile time by @c R_Type.
 *
 * Only the branch matching @c R_Type survives instantiation, so the deduced return type is
 * the type of the selected @c NA constant. For an @c rcpp_T without a branch below nothing is
 * returned and the deduced return type is @c void.
 *
 * @tparam R_Type The @c rcpp_T whose @c NA is requested.
 */
template <rcpp_T R_Type>
static inline constexpr auto na_val() {
    if constexpr (R_Type == rcpp_T::chr) {
        return NA_STRING;
    } else if constexpr (R_Type == rcpp_T::dbl) {
        return NA_REAL;
    } else if constexpr (R_Type == rcpp_T::i64) {
        return NA_INTEGER64;
    } else if constexpr (R_Type == rcpp_T::i32) {
        return NA_INTEGER;
    } else if constexpr (R_Type == rcpp_T::lgl) {
        return NA_LOGICAL;
    }
}


/**
 * Internal flags tracking whether simdjson is compiled with exceptions enabled (the default).
 * If simdjson is compiled w/o exceptions ( @c #define @c SIMDJSON_EXCEPTIONS @c 0 ), operations
 * that do not touch throwing code can be annotated with keyword @c noexcept where appropriate.
 *
 * @c RCPPSIMDJSON_EXCEPTIONS mirrors @c SIMDJSON_EXCEPTIONS when that macro is defined and is
 * @c 1 otherwise. @c RCPPSIMDJSON_NO_EXCEPTIONS is @c true only when @c SIMDJSON_EXCEPTIONS is
 * explicitly @c 0.
 */
// #define SIMDJSON_EXCEPTIONS 0 /* uncomment to disable compiling simdjson w/ exceptions */
#ifdef SIMDJSON_EXCEPTIONS
#define RCPPSIMDJSON_EXCEPTIONS SIMDJSON_EXCEPTIONS
// simdjson treats the macro as a boolean (any non-zero value keeps exceptions on), so only an
// explicit 0 may switch on the noexcept annotations that depend on this flag.
static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = SIMDJSON_EXCEPTIONS == 0;
#else
#define RCPPSIMDJSON_EXCEPTIONS 1
static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = false;
#endif


/**
 * @brief Whether a function handling @c R_Type can be @c noexcept.
 *
 * Intended for use as @c noexcept(is_no_except(rcpp_T)): such a function is @c noexcept only
 * when @c RCPPSIMDJSON_NO_EXCEPTIONS is enabled and its @c rcpp_T argument is not
 * @c rcpp_T::chr.
 *
 * Currently, @c rcpp_T::chr touches throwing code, so functions using it are always
 * @c noexcept(false).
 *
 * Many examples in @c inst/include/RcppSimdJson/deserialize/scalar.hpp.
 *
 * @param R_Type The @c rcpp_T being handled.
 *
 * @return @c true if the annotated function may be @c noexcept, @c false otherwise.
 */
static inline constexpr auto is_no_except(rcpp_T R_Type) // NOLINT(clang-diagnostic-unused-function)
    -> bool {
    if (R_Type == rcpp_T::chr) {
        return false;
    }
    return RCPPSIMDJSON_NO_EXCEPTIONS;
}


namespace deserialize {


/**
 * @brief Determines level of type strictness in combining array elements into R vectors.
 *
 * When arrays are not homogeneous and @c Type_Policy::anything_goes is used, type promotion
 * follows R's behavior.
 *
 * @note The explicit values appear to mirror the integer @c type_policy argument documented
 *       for the R-level functions (0, 1, 2) -- TODO confirm at the call site.
 */
enum class Type_Policy : int {
anything_goes = 0, /**< Non-recursive arrays always become vectors of the highest present type */
ints_as_dbls = 1, /**< Non-recursive arrays of only numbers are promoted to highest type */
strict = 2, /**< No type promotion */
};


/**
 * @brief Maximum simplification level.
 *
 * @note The explicit values appear to mirror the integer @c simplify_to argument documented
 *       for the R-level functions (0 through 3) -- TODO confirm at the call site.
 */
enum class Simplify_To : int {
data_frame = 0, /**< If possible, return dataframes. Otherwise return matrices/vectors/lists. */
matrix = 1, /**< If possible, return matrices. Otherwise return vectors/lists. */
vector = 2, /**< If possible, return vectors. Otherwise return lists. */
list = 3, /**< No simplification. */
};


} // namespace deserialize
} // namespace rcppsimdjson


#include "../simdjson.h"
#include "utils.hpp"


namespace rcppsimdjson {
namespace deserialize {


/**
 * @brief Simplify a @c simdjson::dom::element to an R object.
 *
 * @tparam type_policy @c Type_Policy controlling type strictness when combining array elements.
 *
 * @tparam int64_opt @c utils::Int64_R_Type controlling how big integers are returned to R.
 *
 * @tparam simplify_to @c Simplify_To giving the maximum level of simplification.
 *
 * @param element The @c simdjson::dom::element to simplify.
 *
 * @param empty_array R object to return for an empty JSON array.
 *
 * @param empty_object R object to return for an empty JSON object.
 *
 * @return The resulting R object ( @c SEXP ).
 *
 * @note Forward declaration only. See @c inst/include/RcppSimdJson/deserialize/simplify.hpp
 *       for the definition.
 */
template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
inline auto simplify_element(simdjson::dom::element element, SEXP empty_array, SEXP empty_object)
-> SEXP;


} // namespace deserialize
} // namespace rcppsimdjson


#endif
30 changes: 27 additions & 3 deletions inst/include/RcppSimdJson/deserialize.hpp
Original file line number Diff line number Diff line change
@@ -1,20 +1,43 @@
#ifndef RCPPSIMDJSON__DESERIALIZE_HPP
#define RCPPSIMDJSON__DESERIALIZE_HPP


#include "deserialize/simplify.hpp"


namespace rcppsimdjson {
namespace deserialize {

// THE GREAT DISPATCHER

/**
* @brief Deserialize a parsed @c simdjson::dom::element to R objects.
*
*
* @param parsed @c simdjson::dom::element to deserialize.
*
* @param empty_array R object to return when encountering an empty JSON array.
*
* @param empty_object R object to return when encountering an empty JSON object.
*
* @param type_policy @c Type_Policy specifying type strictness in combining mixed-type array
* elements into R vectors.
*
* @param int64_opt @c Int64_R_Type specifying how big integers are returned to R.
*
* @param simplify_to @c Simplify_To specifying the maximum level of simplification.
*
*
* @return The simplified R object ( @c SEXP ).
*/
inline auto deserialize(const simdjson::dom::element parsed,
const SEXP empty_array,
const SEXP empty_object,
SEXP empty_array,
SEXP empty_object,
const Simplify_To simplify_to,
const Type_Policy type_policy,
const utils::Int64_R_Type int64_opt) -> SEXP {
using Int64_R_Type = utils::Int64_R_Type;

// THE GREAT DISPATCHER
switch (type_policy) {
case Type_Policy::anything_goes: {
switch (int64_opt) {
Expand Down Expand Up @@ -248,4 +271,5 @@ inline auto deserialize(const simdjson::dom::element parsed,
} // namespace deserialize
} // namespace rcppsimdjson


#endif