Merge pull request #27 from eddelbuettel/feature/simdjson_0.4.1

simdjson 0.4.1 (closes #26) (closes #14)
eddelbuettel · Jun 27, 2020 · 5b699ea · 5b699ea
2 parents 20338d3 + db24b82
commit 5b699ea
Show file tree

Hide file tree

Showing 2 changed files with 121 additions and 34 deletions.
diff --git a/inst/include/simdjson.cpp b/inst/include/simdjson.cpp
@@ -1,4 +1,4 @@
-/* auto-generated on Tue 23 Jun 2020 20:51:12 EDT. Do not edit! */
+/* auto-generated on Fri 26 Jun 2020 20:03:28 EDT. Do not edit! */
 /* begin file src/simdjson.cpp */
 #include "simdjson.h"
 
@@ -213,6 +213,10 @@ static inline uint32_t detect_supported_architectures() {
 /* begin file src/simdprune_tables.h */
 #ifndef SIMDJSON_SIMDPRUNE_TABLES_H
 #define SIMDJSON_SIMDPRUNE_TABLES_H
+
+
+#if SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE
+
 #include <cstdint>
 
 namespace simdjson { // table modified and copied from
@@ -340,6 +344,8 @@ static const uint64_t thintable_epi8[256] = {
 
 } // namespace simdjson 
 
+
+#endif //  SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE
 #endif // SIMDJSON_SIMDPRUNE_TABLES_H
 /* end file src/simdprune_tables.h */
 
@@ -934,9 +940,8 @@ struct value128 {
   uint64_t high;
 };
 
-#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) &&                                 \
-    !defined(_M_X64) && !defined(_M_ARM64)// _umul128 for x86, arm
-// this is a slow emulation routine for 32-bit Windows
+#ifdef SIMDJSON_IS_32BITS // _umul128 for x86, arm
+// this is a slow emulation routine for 32-bit
 //
 static inline uint64_t __emulu(uint32_t x, uint32_t y) {
   return x * (uint64_t)y;
@@ -955,15 +960,15 @@ static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
 
 really_inline value128 full_multiplication(uint64_t value1, uint64_t value2) {
   value128 answer;
-#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS)
 #ifdef _M_ARM64
   // ARM64 has native support for 64-bit multiplications, no need to emultate
   answer.high = __umulh(value1, value2);
   answer.low = value1 * value2;
 #else
   answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
 #endif // _M_ARM64
-#else // SIMDJSON_REGULAR_VISUAL_STUDIO
+#else // defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS)
   __uint128_t r = ((__uint128_t)value1) * value2;
   answer.low = uint64_t(r);
   answer.high = uint64_t(r >> 64);
@@ -4038,7 +4043,15 @@ really_inline double compute_float_64(int64_t power, uint64_t i, bool negative,
   // It was described in
   // Clinger WD. How to read floating point numbers accurately.
   // ACM SIGPLAN Notices. 1990
+#ifndef FLT_EVAL_METHOD
+#error "FLT_EVAL_METHOD should be defined, please include cfloat."
+#endif
+#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
+  // We cannot be certain that x/y is rounded to nearest.
+  if (0 <= power && power <= 22 && i <= 9007199254740991) {
+#else
   if (-22 <= power && power <= 22 && i <= 9007199254740991) {
+#endif
     // convert the integer into a double. This is lossless since
     // 0 <= i <= 2^53 - 1.
     double d = double(i);
@@ -6195,7 +6208,15 @@ really_inline double compute_float_64(int64_t power, uint64_t i, bool negative,
   // It was described in
   // Clinger WD. How to read floating point numbers accurately.
   // ACM SIGPLAN Notices. 1990
+#ifndef FLT_EVAL_METHOD
+#error "FLT_EVAL_METHOD should be defined, please include cfloat."
+#endif
+#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
+  // We cannot be certain that x/y is rounded to nearest.
+  if (0 <= power && power <= 22 && i <= 9007199254740991) {
+#else
   if (-22 <= power && power <= 22 && i <= 9007199254740991) {
+#endif
     // convert the integer into a double. This is lossless since
     // 0 <= i <= 2^53 - 1.
     double d = double(i);
@@ -7826,7 +7847,7 @@ really_inline bool add_overflow(uint64_t value1, uint64_t value2,
 #endif
 }
 
-#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS)
 #pragma intrinsic(_umul128)
 #endif
 really_inline bool mul_overflow(uint64_t value1, uint64_t value2,
@@ -9499,7 +9520,15 @@ really_inline double compute_float_64(int64_t power, uint64_t i, bool negative,
   // It was described in
   // Clinger WD. How to read floating point numbers accurately.
   // ACM SIGPLAN Notices. 1990
+#ifndef FLT_EVAL_METHOD
+#error "FLT_EVAL_METHOD should be defined, please include cfloat."
+#endif
+#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
+  // We cannot be certain that x/y is rounded to nearest.
+  if (0 <= power && power <= 22 && i <= 9007199254740991) {
+#else
   if (-22 <= power && power <= 22 && i <= 9007199254740991) {
+#endif
     // convert the integer into a double. This is lossless since
     // 0 <= i <= 2^53 - 1.
     double d = double(i);
@@ -11114,7 +11143,7 @@ really_inline bool add_overflow(uint64_t value1, uint64_t value2,
 #endif
 }
 
-#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+#if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) || defined(SIMDJSON_IS_32BITS)
 #pragma intrinsic(_umul128)
 #endif
 really_inline bool mul_overflow(uint64_t value1, uint64_t value2,
@@ -12777,7 +12806,15 @@ really_inline double compute_float_64(int64_t power, uint64_t i, bool negative,
   // It was described in
   // Clinger WD. How to read floating point numbers accurately.
   // ACM SIGPLAN Notices. 1990
+#ifndef FLT_EVAL_METHOD
+#error "FLT_EVAL_METHOD should be defined, please include cfloat."
+#endif
+#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
+  // We cannot be certain that x/y is rounded to nearest.
+  if (0 <= power && power <= 22 && i <= 9007199254740991) {
+#else
   if (-22 <= power && power <= 22 && i <= 9007199254740991) {
+#endif
     // convert the integer into a double. This is lossless since
     // 0 <= i <= 2^53 - 1.
     double d = double(i);

diff --git a/inst/include/simdjson.h b/inst/include/simdjson.h
@@ -1,4 +1,4 @@
-/* auto-generated on Tue 23 Jun 2020 20:51:12 EDT. Do not edit! */
+/* auto-generated on Fri 26 Jun 2020 20:03:28 EDT. Do not edit! */
 /* begin file include/simdjson.h */
 #ifndef SIMDJSON_H
 #define SIMDJSON_H
@@ -58,6 +58,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>
+#include <cfloat>
 
 
 #ifdef _MSC_VER
@@ -91,24 +92,28 @@
 
 #if defined(__x86_64__) || defined(_M_AMD64)
 #define SIMDJSON_IS_X86_64 1
-#endif
-#if defined(__aarch64__) || defined(_M_ARM64)
+#elif defined(__aarch64__) || defined(_M_ARM64)
 #define SIMDJSON_IS_ARM64 1
+#else 
+#define SIMDJSON_IS_32BITS 1
+
+// We do not support 32-bit platforms, but it can be
+// handy to identify them.
+#if defined(_M_IX86) || defined(__i386__)
+#define SIMDJSON_IS_X86_32BITS 1
+#elif defined(__arm__) || defined(_M_ARM)
+#define SIMDJSON_IS_ARM_32BITS 1
 #endif
 
-#if (!defined(SIMDJSON_IS_X86_64)) && (!defined(SIMDJSON_IS_ARM64))
-#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
-#pragma message("The simdjson library is designed\
- for 64-bit processors and it seems that you are not \
+#endif // defined(__x86_64__) || defined(_M_AMD64)
+
+#ifdef SIMDJSON_IS_32BITS
+#pragma message("The simdjson library is designed \
+for 64-bit processors and it seems that you are not \
 compiling for a known 64-bit platform. All fast kernels \
 will be disabled and performance may be poor. Please \
 use a 64-bit target such as x64 or 64-bit ARM.")
-#else
-#error "The simdjson library is designed\
- for 64-bit processors. It seems that you are not \
-compiling for a known 64-bit platform."
-#endif
-#endif // (!defined(SIMDJSON_IS_X86_64)) && (!defined(SIMDJSON_IS_ARM64))
+#endif // SIMDJSON_IS_32BITS
 
 // this is almost standard?
 #undef STRINGIFY_IMPLEMENTATION_
@@ -128,6 +133,15 @@ compiling for a known 64-bit platform."
 #define SIMDJSON_IMPLEMENTATION_WESTMERE 0
 #endif // SIMDJSON_IS_ARM64
 
+// Our fast kernels require 64-bit systems.
+//
+// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions. 
+// Furthermore, the number of SIMD registers is reduced. 
+//
+// On 32-bit ARM, we would have smaller registers.
+//
+// The simdjson users should still have the fallback kernel. It is 
+// slower, but it should run everywhere.
 #if SIMDJSON_IS_X86_64
 #ifndef SIMDJSON_IMPLEMENTATION_HASWELL
 #define SIMDJSON_IMPLEMENTATION_HASWELL 1
@@ -138,7 +152,7 @@ compiling for a known 64-bit platform."
 #define SIMDJSON_IMPLEMENTATION_ARM64 0
 #endif // SIMDJSON_IS_X86_64
 
-// we are going to use runtime dispatch
+// We are going to use runtime dispatch.
 #ifdef SIMDJSON_IS_X86_64
 #ifdef __clang__
 // clang does not have GCC push pop
@@ -2018,7 +2032,7 @@ SIMDJSON_DISABLE_UNDESIRED_WARNINGS
 #define SIMDJSON_SIMDJSON_VERSION_H
 
 /** The version of simdjson being used (major.minor.revision) */
-#define SIMDJSON_VERSION 0.4.0
+#define SIMDJSON_VERSION 0.4.1
 
 namespace simdjson {
 enum {
@@ -2033,7 +2047,7 @@ enum {
   /**
    * The revision (major.minor.REVISION) of simdjson being used.
    */
-  SIMDJSON_VERSION_REVISION = 0
+  SIMDJSON_VERSION_REVISION = 1
 };
 } // namespace simdjson
 
@@ -2632,7 +2646,6 @@ inline error_code dom_parser_implementation::allocate(size_t capacity, size_t ma
 
 #endif // SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H
 /* end file include/simdjson/internal/dom_parser_implementation.h */
-#include <optional>
 #include <string>
 #include <atomic>
 #include <vector>
@@ -4008,21 +4021,42 @@ class element {
    */
   inline simdjson_result<object> get_object() const noexcept;
   /**
-   * Cast this element to a string.
+   * Cast this element to a null-terminated C string. 
+   * 
+   * The string is guaranteed to be valid UTF-8.
+   *
+   * The get_c_str() function is equivalent to get<const char *>().
+   * 
+   * The length of the string is given by get_string_length(). Because JSON strings
+   * may contain null characters, it may be incorrect to use strlen to determine the 
+   * string length.
    *
-   * Equivalent to get<const char *>().
+   * It is possible to get a single string_view instance which represents both the string
+   * content and its length: see get_string().
    *
-   * @returns An pointer to a null-terminated string. This string is stored in the parser and will
+   * @returns A pointer to a null-terminated UTF-8 string. This string is stored in the parser and will
    *          be invalidated the next time it parses a document or when it is destroyed.
    *          Returns INCORRECT_TYPE if the JSON element is not a string.
    */
   inline simdjson_result<const char *> get_c_str() const noexcept;
   /**
-   * Cast this element to a string.
+   * Gives the length in bytes of the string.
+   * 
+   * It is possible to get a single string_view instance which represents both the string
+   * content and its length: see get_string().
+   *
+   * @returns A string length in bytes.
+   *          Returns INCORRECT_TYPE if the JSON element is not a string.
+   */
+  inline simdjson_result<size_t> get_string_length() const noexcept;
+  /**
+   * Cast this element to a string. 
+   * 
+   * The string is guaranteed to be valid UTF-8.
    *
    * Equivalent to get<std::string_view>().
    *
-   * @returns A string. The string is stored in the parser and will be invalidated the next time it
+   * @returns An UTF-8 string. The string is stored in the parser and will be invalidated the next time it
    *          parses a document or when it is destroyed.
    *          Returns INCORRECT_TYPE if the JSON element is not a string.
    */
@@ -4199,7 +4233,9 @@ class element {
   inline operator bool() const noexcept(false);
 
   /**
-   * Read this element as a null-terminated string.
+   * Read this element as a null-terminated UTF-8 string.
+   * 
+   * Be mindful that JSON allows strings to contain null characters.
    *
    * Does *not* convert other types to a string; requires that the JSON type of the element was
    * an actual string.
@@ -4210,7 +4246,7 @@ class element {
   inline explicit operator const char*() const noexcept(false);
 
   /**
-   * Read this element as a null-terminated string.
+   * Read this element as a null-terminated UTF-8 string.
    *
    * Does *not* convert other types to a string; requires that the JSON type of the element was
    * an actual string.
@@ -4410,6 +4446,7 @@ struct simdjson_result<dom::element> : public internal::simdjson_result_base<dom
   really_inline simdjson_result<dom::array> get_array() const noexcept;
   really_inline simdjson_result<dom::object> get_object() const noexcept;
   really_inline simdjson_result<const char *> get_c_str() const noexcept;
+  really_inline simdjson_result<size_t> get_string_length() const noexcept;
   really_inline simdjson_result<std::string_view> get_string() const noexcept;
   really_inline simdjson_result<int64_t> get_int64() const noexcept;
   really_inline simdjson_result<uint64_t> get_uint64() const noexcept;
@@ -5820,6 +5857,10 @@ really_inline simdjson_result<const char *> simdjson_result<dom::element>::get_c
   if (error()) { return error(); }
   return first.get_c_str();
 }
+really_inline simdjson_result<size_t> simdjson_result<dom::element>::get_string_length() const noexcept {
+  if (error()) { return error(); }
+  return first.get_string_length();
+}
 really_inline simdjson_result<std::string_view> simdjson_result<dom::element>::get_string() const noexcept {
   if (error()) { return error(); }
   return first.get_string();
@@ -5960,6 +6001,15 @@ inline simdjson_result<const char *> element::get_c_str() const noexcept {
       return INCORRECT_TYPE;
   }
 }
+inline simdjson_result<size_t> element::get_string_length() const noexcept {
+  switch (tape.tape_ref_type()) {
+    case internal::tape_type::STRING: {
+      return tape.get_string_length();
+    }
+    default:
+      return INCORRECT_TYPE;
+  }
+}
 inline simdjson_result<std::string_view> element::get_string() const noexcept {
   switch (tape.tape_ref_type()) {
     case internal::tape_type::STRING:
@@ -7610,14 +7660,14 @@ really_inline T tape_ref::next_tape_value() const noexcept {
 }
 
 really_inline uint32_t internal::tape_ref::get_string_length() const noexcept {
-  uint64_t string_buf_index = size_t(tape_value());
+  size_t string_buf_index = size_t(tape_value());
   uint32_t len;
   memcpy(&len, &doc->string_buf[string_buf_index], sizeof(len));
   return len;
 }
 
 really_inline const char * internal::tape_ref::get_c_str() const noexcept {
-  uint64_t string_buf_index = size_t(tape_value());
+  size_t string_buf_index = size_t(tape_value());
   return reinterpret_cast<const char *>(&doc->string_buf[string_buf_index + sizeof(uint32_t)]);
 }