From 70adc7b24faa90036343f77c9adf2568ce682882 Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Mon, 15 Jun 2020 08:13:25 -0700
Subject: [PATCH 01/16] fix bad type coercion (int64_t to double)

---
 inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp
index e272e20..e92e05e 100644
--- a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp
+++ b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp
@@ -231,6 +231,7 @@ inline constexpr auto Type_Doctor<Type_Policy::ints_as_dbls>::common_R_type() co
     return rcpp_T::dbl;
   }
   if (i64_ && !(i32_ || lgl_ || u64_)) {
+  if (i64_ && !(lgl_ || u64_)) {
     // only 64/32-bit integers: will follow selected Int64_R_Type option
     return rcpp_T::i64;
   }

From 47b770ce3226d34d9492a4ad0575eddd4ee43c66 Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Mon, 15 Jun 2020 08:20:59 -0700
Subject: [PATCH 02/16] move globals/macros to
 inst/include/RcppSimdJson/common.hpp and add documentation

---
 inst/include/RcppSimdJson.hpp                 |  62 ---------
 inst/include/RcppSimdJson/common.hpp          | 121 ++++++++++++++++++
 .../RcppSimdJson/deserialize/Type_Doctor.hpp  |  13 +-
 .../RcppSimdJson/deserialize/scalar.hpp       |   2 +-
 .../RcppSimdJson/deserialize/simplify.hpp     |  20 +--
 5 files changed, 130 insertions(+), 88 deletions(-)
 create mode 100644 inst/include/RcppSimdJson/common.hpp

diff --git a/inst/include/RcppSimdJson.hpp b/inst/include/RcppSimdJson.hpp
index cef0285..1c0f8b8 100644
--- a/inst/include/RcppSimdJson.hpp
+++ b/inst/include/RcppSimdJson.hpp
@@ -1,68 +1,6 @@
 #ifndef RCPPSIMDJSON_HPP
 #define RCPPSIMDJSON_HPP
 
-#define STRICT_R_HEADERS
-#include <Rcpp.h>
-
-
-namespace rcppsimdjson {
-
-static inline constexpr int64_t NA_INTEGER64 = LLONG_MIN;
-
-
-enum class rcpp_T : int {
-  array = 0,
-  object = 1,
-  chr = 2,
-  u64 = 3,
-  dbl = 4,
-  i64 = 5,
-  i32 = 6,
-  lgl = 7,
-  null = 8,
-};
-
-
-template <rcpp_T R_Type> static inline constexpr auto na_val() {
-  if constexpr (R_Type == rcpp_T::chr) {
-    return NA_STRING;
-  }
-  if constexpr (R_Type == rcpp_T::dbl) {
-    return NA_REAL;
-  }
-  if constexpr (R_Type == rcpp_T::i64) {
-    return NA_INTEGER64;
-  }
-  if constexpr (R_Type == rcpp_T::i32) {
-    return NA_INTEGER;
-  }
-  if constexpr (R_Type == rcpp_T::lgl) {
-    return NA_LOGICAL;
-  }
-}
-
-
-// #define SIMDJSON_EXCEPTIONS 0
-#ifdef SIMDJSON_EXCEPTIONS
-#define RCPPSIMDJSON_EXCEPTIONS SIMDJSON_EXCEPTIONS
-static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = SIMDJSON_EXCEPTIONS != 1;
-#else
-#define RCPPSIMDJSON_EXCEPTIONS 1
-static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = false;
-#endif
-
-
-static inline constexpr auto is_no_except(rcpp_T R_Type) -> bool {
-  // all scalars seem to be extractable w/o touching throwing code except for strings
-  return RCPPSIMDJSON_NO_EXCEPTIONS && R_Type != rcpp_T::chr;
-}
-
-
-} // namespace rcppsimdjson
-
-#include <simdjson.h>
-
-#include "RcppSimdJson/utils.hpp"
 #include "RcppSimdJson/deserialize.hpp"
 
 #endif
diff --git a/inst/include/RcppSimdJson/common.hpp b/inst/include/RcppSimdJson/common.hpp
new file mode 100644
index 0000000..44b88da
--- /dev/null
+++ b/inst/include/RcppSimdJson/common.hpp
@@ -0,0 +1,121 @@
+#ifndef RCPPSIMDJSON_COMMON_HPP
+#define RCPPSIMDJSON_COMMON_HPP
+
+
+#define STRICT_R_HEADERS
+#include <Rcpp.h>
+
+
+namespace rcppsimdjson {
+
+/*
+ * `bit64::integer64`-compatible `NA`
+ */
+static inline constexpr int64_t NA_INTEGER64 = LLONG_MIN;
+
+
+/*
+ * Typing arguments that decide how `simdjson::dom::element`s are ultimately returned to R.
+ */
+enum class rcpp_T : int {
+  array = 0,  /* recursive: individual elements will decide ultimate R type */
+  object = 1, /* recursive: individual elements will decide ultimate R type */
+  chr = 2,    /* always becomes `Rcpp::String`/`character(1L)` */
+  u64 = 3,    /* always becomes `Rcpp::String`/`character(1L)` */
+  dbl = 4,    /* always becomes `double` */
+  i64 = 5,    /* follows Int64_R_Type: `double`, `character(1L)`, or `bit64::integer64` */
+  i32 = 6,    /* always becomes `int` */
+  lgl = 7,    /* always becomes `bool` */
+  null = 8,   /* becomes `NA` if returned in a vector, else `NULL` */
+};
+
+
+/*
+ * Generic, typed `NA` inserter.
+ */
+template <rcpp_T R_Type> static inline constexpr auto na_val() {
+  if constexpr (R_Type == rcpp_T::chr) {
+    return NA_STRING;
+  }
+  if constexpr (R_Type == rcpp_T::dbl) {
+    return NA_REAL;
+  }
+  if constexpr (R_Type == rcpp_T::i64) {
+    return NA_INTEGER64;
+  }
+  if constexpr (R_Type == rcpp_T::i32) {
+    return NA_INTEGER;
+  }
+  if constexpr (R_Type == rcpp_T::lgl) {
+    return NA_LOGICAL;
+  }
+}
+
+
+/*
+ * Internal flags tracking whether simdjson is compiled with exceptions enabled (the default).
+ * If simdjson is compiled w/o exceptions (`#define SIMDJSON_EXCEPTIONS 0`), operations that do not
+ * touch throwing code can be annotated with keyword `noexcept` where appropriate.
+ * See inst/include/RcppSimdJson/deserialize/scalar.hpp for examples.
+ */
+// #define SIMDJSON_EXCEPTIONS 0 /* uncomment to disable compiling simdjson w/ exceptions */
+#ifdef SIMDJSON_EXCEPTIONS
+#define RCPPSIMDJSON_EXCEPTIONS SIMDJSON_EXCEPTIONS
+static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = SIMDJSON_EXCEPTIONS != 1;
+#else
+#define RCPPSIMDJSON_EXCEPTIONS 1 // NOLINT(cppcoreguidelines-macro-usage)
+static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = false;
+#endif
+
+
+/*
+ * All scalar-getter functions are annotated with `is_no_except()`, which will be true only if
+ * `RCPPSIMDJSON_NO_EXCEPTIONS` is enabled and their `rcpp_T` template argument is not
+ * `rcpp_T::chr` (strings are not currently extractable w/o touching throwing code).
+ */
+static inline constexpr auto is_no_except(rcpp_T R_Type) noexcept -> bool {
+  return RCPPSIMDJSON_NO_EXCEPTIONS && R_Type != rcpp_T::chr;
+}
+
+
+namespace deserialize {
+
+
+static inline constexpr bool HAS_NULLS = true;
+static inline constexpr bool NO_NULLS = false;
+
+/*
+ * Determines level of type strictness in combining array elements into R vectors.
+ *
+ * When arrays are not homogeneous and `Type_Policy::anything_goes` is used, type promotion follows
+ * R's behavior.
+ */
+enum class Type_Policy : int {
+  anything_goes = 0, /* Non-recursive arrays always become vectors */
+  ints_as_dbls = 1,  /* Combines `rcpp_T::i32`s, `::i64`s, and `::dbl`s */
+  strict = 2,        /* No type promotions */
+};
+
+
+/*
+ * Maximum simplification level. `Simplify_To::list` results in no simplification.
+ */
+enum class Simplify_To : int {
+  data_frame = 0,
+  matrix = 1,
+  vector = 2,
+  list = 3,
+};
+
+
+} // namespace deserialize
+
+
+} // namespace rcppsimdjson
+
+
+#include <simdjson.h>
+#include "RcppSimdJson/utils.hpp"
+
+
+#endif
\ No newline at end of file
diff --git a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp
index e92e05e..c692953 100644
--- a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp
+++ b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp
@@ -1,16 +1,18 @@
 #ifndef RCPPSIMDJSON__DESERIALIZE__TYPE_DOCTOR_HPP
 #define RCPPSIMDJSON__DESERIALIZE__TYPE_DOCTOR_HPP
 
+#include "../common.hpp"
+
 
 namespace rcppsimdjson {
 namespace deserialize {
 
 
-enum class Type_Policy : int {
-  anything_goes = 0,
-  ints_as_dbls = 1,
-  strict = 2,
-};
+// enum class Type_Policy : int {
+//   anything_goes = 0,
+//   ints_as_dbls = 1,
+//   strict = 2,
+// };
 
 
 template <Type_Policy type_policy> class Type_Doctor {
@@ -230,7 +232,6 @@ inline constexpr auto Type_Doctor<Type_Policy::ints_as_dbls>::common_R_type() co
   if (dbl_ && !(lgl_ || u64_)) { // any number will become double
     return rcpp_T::dbl;
   }
-  if (i64_ && !(i32_ || lgl_ || u64_)) {
   if (i64_ && !(lgl_ || u64_)) {
     // only 64/32-bit integers: will follow selected Int64_R_Type option
     return rcpp_T::i64;
diff --git a/inst/include/RcppSimdJson/deserialize/scalar.hpp b/inst/include/RcppSimdJson/deserialize/scalar.hpp
index 258478b..78acfdf 100644
--- a/inst/include/RcppSimdJson/deserialize/scalar.hpp
+++ b/inst/include/RcppSimdJson/deserialize/scalar.hpp
@@ -63,7 +63,7 @@ inline auto get_scalar_<int64_t, rcpp_T::chr>(simdjson::dom::element element) no
 template <>
 inline auto get_scalar_<int64_t, rcpp_T::dbl>(simdjson::dom::element element) noexcept(
     is_no_except(rcpp_T::dbl)) {
-  return static_cast<double>(element.get<int64_t>().first);
+  return element.get<double>().first;
 }
 // return int64_t
 template <>
diff --git a/inst/include/RcppSimdJson/deserialize/simplify.hpp b/inst/include/RcppSimdJson/deserialize/simplify.hpp
index 096ef61..8e5a504 100644
--- a/inst/include/RcppSimdJson/deserialize/simplify.hpp
+++ b/inst/include/RcppSimdJson/deserialize/simplify.hpp
@@ -2,35 +2,17 @@
 #define RCPPSIMDJSON__DESERIALIZE__SIMPLIFY_HPP
 
 
-namespace rcppsimdjson {
-namespace deserialize {
-
-
-static inline constexpr bool HAS_NULLS = true;
-static inline constexpr bool NO_NULLS = false;
-
-} // namespace deserialize
-} // namespace rcppsimdjson
-
-
 #include "Type_Doctor.hpp"
 #include "scalar.hpp"
 #include "vector.hpp"
 #include "matrix.hpp"
 #include "dataframe.hpp"
 
+
 namespace rcppsimdjson {
 namespace deserialize {
 
 
-enum class Simplify_To : int {
-  data_frame = 0,
-  matrix = 1,
-  vector = 2,
-  list = 3,
-};
-
-
 // forward declaration
 template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
 inline auto simplify_element(const simdjson::dom::element, const SEXP, const SEXP) -> SEXP;

From 891df4a8b7acaeed94e9b848bb7bfe86649b9ee7 Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Mon, 15 Jun 2020 08:55:37 -0700
Subject: [PATCH 03/16] move forward-declaration for simplify_element() to
 inst/include/RcppSimdJson/common.hpp so build_data_frame() can go where it
 should have been (inst/include/RcppSimdJson/deserialize/dataframe.hpp)

---
 inst/include/RcppSimdJson/common.hpp          | 14 ++++
 .../RcppSimdJson/deserialize/dataframe.hpp    | 74 +++++++++++++++++
 .../RcppSimdJson/deserialize/simplify.hpp     | 81 +------------------
 3 files changed, 90 insertions(+), 79 deletions(-)

diff --git a/inst/include/RcppSimdJson/common.hpp b/inst/include/RcppSimdJson/common.hpp
index 44b88da..71a9d86 100644
--- a/inst/include/RcppSimdJson/common.hpp
+++ b/inst/include/RcppSimdJson/common.hpp
@@ -118,4 +118,18 @@ enum class Simplify_To : int {
 #include "RcppSimdJson/utils.hpp"
 
 
+namespace rcppsimdjson {
+namespace deserialize {
+
+
+// forward declaration: definition in inst/include/RcppSimdJson/deserialize/simplify.hpp
+template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
+inline auto simplify_element(simdjson::dom::element element, SEXP empty_array, SEXP empty_object)
+    -> SEXP;
+
+
+} // namespace deserialize
+} // namespace rcppsimdjson
+
+
 #endif
\ No newline at end of file
diff --git a/inst/include/RcppSimdJson/deserialize/dataframe.hpp b/inst/include/RcppSimdJson/deserialize/dataframe.hpp
index 0a8a0e2..9cd01e1 100644
--- a/inst/include/RcppSimdJson/deserialize/dataframe.hpp
+++ b/inst/include/RcppSimdJson/deserialize/dataframe.hpp
@@ -159,6 +159,80 @@ inline auto build_col_integer64(const simdjson::dom::array array,
 }
 
 
+
+template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
+inline auto build_data_frame(const simdjson::dom::array array,
+                             const std::map<std::string_view, Column<type_policy>>& cols,
+                             SEXP empty_array,
+                             SEXP empty_object) -> SEXP {
+
+  const auto n_rows = R_xlen_t(std::size(array));
+  auto out = Rcpp::List(std::size(cols));
+  auto out_names = Rcpp::CharacterVector(std::size(cols));
+
+  for (auto [key, col] : cols) {
+    out_names[col.index] = std::string(key);
+
+    switch (col.schema.common_R_type()) {
+      case rcpp_T::chr: {
+        out[col.index] =
+            build_col<STRSXP, std::string, rcpp_T::chr, type_policy>(array, key, col.schema);
+        break;
+      }
+
+      case rcpp_T::dbl: {
+        out[col.index] =
+            build_col<REALSXP, double, rcpp_T::dbl, type_policy>(array, key, col.schema);
+        break;
+      }
+
+      case rcpp_T::i64: {
+        out[col.index] = build_col_integer64<type_policy, int64_opt>(array, key, col.schema);
+        break;
+      }
+
+      case rcpp_T::i32: {
+        out[col.index] =
+            build_col<INTSXP, int64_t, rcpp_T::i32, type_policy>(array, key, col.schema);
+        break;
+      }
+
+      case rcpp_T::lgl: {
+        out[col.index] = build_col<LGLSXP, bool, rcpp_T::lgl, type_policy>(array, key, col.schema);
+        break;
+      }
+
+      case rcpp_T::null: {
+        out[col.index] = Rcpp::LogicalVector(n_rows, NA_LOGICAL);
+        break;
+      }
+
+      default: {
+        auto this_col = Rcpp::Vector<VECSXP>(n_rows);
+        auto i_row = R_xlen_t(0);
+        for (auto element : array) {
+          auto [value, error] = element.get<simdjson::dom::object>().at_key(key);
+          if (error) {
+            this_col[i_row++] = NA_LOGICAL;
+          } else {
+            this_col[i_row++] = simplify_element<type_policy, int64_opt, simplify_to>(
+                value, empty_array, empty_object //
+            );
+          }
+        }
+        out[col.index] = this_col;
+      }
+    }
+  }
+
+  out.attr("names") = out_names;
+  out.attr("row.names") = Rcpp::seq(1, n_rows);
+  out.attr("class") = "data.frame";
+
+  return out;
+}
+
+
 } // namespace deserialize
 } // namespace rcppsimdjson
 
diff --git a/inst/include/RcppSimdJson/deserialize/simplify.hpp b/inst/include/RcppSimdJson/deserialize/simplify.hpp
index 8e5a504..2614895 100644
--- a/inst/include/RcppSimdJson/deserialize/simplify.hpp
+++ b/inst/include/RcppSimdJson/deserialize/simplify.hpp
@@ -1,7 +1,7 @@
 #ifndef RCPPSIMDJSON__DESERIALIZE__SIMPLIFY_HPP
 #define RCPPSIMDJSON__DESERIALIZE__SIMPLIFY_HPP
 
-
+#include "../common.hpp"
 #include "Type_Doctor.hpp"
 #include "scalar.hpp"
 #include "vector.hpp"
@@ -13,84 +13,6 @@ namespace rcppsimdjson {
 namespace deserialize {
 
 
-// forward declaration
-template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
-inline auto simplify_element(const simdjson::dom::element, const SEXP, const SEXP) -> SEXP;
-
-
-template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
-inline auto build_data_frame(const simdjson::dom::array array,
-                             const std::map<std::string_view, Column<type_policy>>& cols,
-                             const SEXP empty_array,
-                             const SEXP empty_object) -> SEXP {
-
-  const auto n_rows = R_xlen_t(std::size(array));
-  auto out = Rcpp::List(std::size(cols));
-  auto out_names = Rcpp::CharacterVector(std::size(cols));
-
-  for (auto [key, col] : cols) {
-    out_names[col.index] = std::string(key);
-
-    switch (col.schema.common_R_type()) {
-      case rcpp_T::chr: {
-        out[col.index] =
-            build_col<STRSXP, std::string, rcpp_T::chr, type_policy>(array, key, col.schema);
-        break;
-      }
-
-      case rcpp_T::dbl: {
-        out[col.index] =
-            build_col<REALSXP, double, rcpp_T::dbl, type_policy>(array, key, col.schema);
-        break;
-      }
-
-      case rcpp_T::i64: {
-        out[col.index] = build_col_integer64<type_policy, int64_opt>(array, key, col.schema);
-        break;
-      }
-
-      case rcpp_T::i32: {
-        out[col.index] =
-            build_col<INTSXP, int64_t, rcpp_T::i32, type_policy>(array, key, col.schema);
-        break;
-      }
-
-      case rcpp_T::lgl: {
-        out[col.index] = build_col<LGLSXP, bool, rcpp_T::lgl, type_policy>(array, key, col.schema);
-        break;
-      }
-
-      case rcpp_T::null: {
-        out[col.index] = Rcpp::LogicalVector(n_rows, NA_LOGICAL);
-        break;
-      }
-
-      default: {
-        auto this_col = Rcpp::Vector<VECSXP>(n_rows);
-        auto i_row = R_xlen_t(0);
-        for (auto element : array) {
-          auto [value, error] = element.get<simdjson::dom::object>().at_key(key);
-          if (error) {
-            this_col[i_row++] = NA_LOGICAL;
-          } else {
-            this_col[i_row++] = simplify_element<type_policy, int64_opt, simplify_to>(
-                value, empty_array, empty_object //
-            );
-          }
-        }
-        out[col.index] = this_col;
-      }
-    }
-  }
-
-  out.attr("names") = out_names;
-  out.attr("row.names") = Rcpp::seq(1, n_rows);
-  out.attr("class") = "data.frame";
-
-  return out;
-}
-
-
 template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
 inline auto simplify_list(const simdjson::dom::array array,
                           const SEXP empty_array,
@@ -234,6 +156,7 @@ inline auto simplify_object(const simdjson::dom::object object,
 }
 
 
+// definition: forward declaration in inst/include/RcppSimdJson/deserialize/simplify.hpp
 template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
 inline auto simplify_element(const simdjson::dom::element element,
                              const SEXP empty_array,

From 89d5b34d049ba8f5c28c7cd23fb5bf6de31b689a Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Mon, 15 Jun 2020 15:38:32 -0700
Subject: [PATCH 04/16] make all integer types explicit

---
 .../RcppSimdJson/deserialize/dataframe.hpp    | 11 +++++-----
 .../RcppSimdJson/deserialize/matrix.hpp       | 22 ++++++++++---------
 .../RcppSimdJson/deserialize/simplify.hpp     |  6 ++---
 .../RcppSimdJson/deserialize/vector.hpp       | 12 +++++-----
 4 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/inst/include/RcppSimdJson/deserialize/dataframe.hpp b/inst/include/RcppSimdJson/deserialize/dataframe.hpp
index 9cd01e1..9f897e5 100644
--- a/inst/include/RcppSimdJson/deserialize/dataframe.hpp
+++ b/inst/include/RcppSimdJson/deserialize/dataframe.hpp
@@ -8,7 +8,7 @@ namespace rcppsimdjson {
 namespace deserialize {
 
 template <Type_Policy type_policy> struct Column {
-  R_xlen_t index = 0;
+  R_xlen_t index = 0L;
   Type_Doctor<type_policy> schema = Type_Doctor<type_policy>();
 };
 
@@ -24,7 +24,7 @@ diagnose_data_frame(const simdjson::dom::array array) noexcept(RCPPSIMDJSON_NO_E
     -> std::optional<Column_Schema<type_policy>> {
 
   auto cols = Column_Schema<type_policy>();
-  auto col_index = 0;
+  auto col_index = R_xlen_t(0L);
 
   if (std::size(array) == 0) {
     return std::nullopt;
@@ -59,7 +59,7 @@ inline auto build_col(const simdjson::dom::array array,
                       const Type_Doctor<type_policy>& type_doc) -> Rcpp::Vector<RTYPE> {
 
   auto out = Rcpp::Vector<RTYPE>(std::size(array), na_val<R_Type>());
-  auto i_row = R_xlen_t(0);
+  auto i_row = R_xlen_t(0L);
 
   if (type_doc.is_homogeneous()) {
     if (type_doc.has_null()) {
@@ -112,7 +112,7 @@ inline auto build_col_integer64(const simdjson::dom::array array,
 
   if constexpr (int64_opt == utils::Int64_R_Type::Integer64) {
     auto stl_vec = std::vector<int64_t>(std::size(array), NA_INTEGER64);
-    auto i_row = std::size_t(0);
+    auto i_row = std::size_t(0ULL);
 
     if (type_doc.is_homogeneous()) {
       if (type_doc.has_null()) {
@@ -159,7 +159,6 @@ inline auto build_col_integer64(const simdjson::dom::array array,
 }
 
 
-
 template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
 inline auto build_data_frame(const simdjson::dom::array array,
                              const std::map<std::string_view, Column<type_policy>>& cols,
@@ -209,7 +208,7 @@ inline auto build_data_frame(const simdjson::dom::array array,
 
       default: {
         auto this_col = Rcpp::Vector<VECSXP>(n_rows);
-        auto i_row = R_xlen_t(0);
+        auto i_row = R_xlen_t(0L);
         for (auto element : array) {
           auto [value, error] = element.get<simdjson::dom::object>().at_key(key);
           if (error) {
diff --git a/inst/include/RcppSimdJson/deserialize/matrix.hpp b/inst/include/RcppSimdJson/deserialize/matrix.hpp
index 074ca34..a08995c 100644
--- a/inst/include/RcppSimdJson/deserialize/matrix.hpp
+++ b/inst/include/RcppSimdJson/deserialize/matrix.hpp
@@ -47,11 +47,11 @@ inline auto diagnose(simdjson::dom::array array) noexcept(RCPPSIMDJSON_NO_EXCEPT
 
 
 template <int RTYPE, typename in_T, rcpp_T R_Type, bool has_nulls>
-inline auto build_matrix_typed(simdjson::dom::array array, const std::size_t n_cols)
+inline auto build_matrix_typed(simdjson::dom::array array, std::size_t n_cols)
     -> Rcpp::Vector<RTYPE> {
 
-  const auto n_rows = std::size(array);
-  auto out = Rcpp::Matrix<RTYPE>(n_rows, n_cols);
+  const auto n_rows = r_length(array);
+  auto out = Rcpp::Matrix<RTYPE>(n_rows, static_cast<R_xlen_t>(n_cols));
   auto j = R_xlen_t(0);
 
 #if RCPPSIMDJSON_EXCEPTIONS
@@ -106,7 +106,8 @@ inline auto build_matrix_integer64_typed(simdjson::dom::array array, const std::
 #endif
 
   auto out = Rcpp::NumericVector(utils::as_integer64(stl_vec_int64));
-  out.attr("dim") = Rcpp::IntegerVector::create(n_rows, n_cols);
+  out.attr("dim") = Rcpp::IntegerVector::create(static_cast<R_xlen_t>(n_rows), //
+                                                static_cast<R_xlen_t>(n_cols));
 
   return out;
 }
@@ -159,7 +160,7 @@ inline auto dispatch_typed(const simdjson::dom::array array,
                        : build_matrix_typed<LGLSXP, bool, rcpp_T::lgl, NO_NULLS>(array, n_cols);
 
     case simdjson::dom::element_type::NULL_VALUE:
-      return Rcpp::LogicalVector(std::size(array), NA_LOGICAL);
+      return Rcpp::LogicalVector(r_length(array), NA_LOGICAL);
 
     case simdjson::dom::element_type::UINT64:
       return has_nulls ? build_matrix_typed<STRSXP, uint64_t, rcpp_T::chr, HAS_NULLS>(array, n_cols)
@@ -173,8 +174,8 @@ inline auto dispatch_typed(const simdjson::dom::array array,
 template <int RTYPE>
 inline auto build_matrix_mixed(const simdjson::dom::array array, const std::size_t n_cols) -> SEXP {
 
-  const auto n_rows = std::size(array);
-  Rcpp::Matrix<RTYPE> out(n_rows, n_cols);
+  const auto n_rows = r_length(array);
+  Rcpp::Matrix<RTYPE> out(n_rows, static_cast<R_xlen_t>(n_cols));
 
   auto j = R_xlen_t(0);
 
@@ -207,7 +208,7 @@ inline auto build_matrix_integer64_mixed(const simdjson::dom::array array, std::
 
   const auto n_rows = std::size(array);
   auto stl_vec_int64 = std::vector<int64_t>(n_rows * n_cols);
-  auto j = std::size_t(0);
+  auto j = std::size_t(0ULL);
 
 #if RCPPSIMDJSON_EXCEPTIONS
   for (simdjson::dom::array sub_array : array) {
@@ -253,7 +254,8 @@ inline auto build_matrix_integer64_mixed(const simdjson::dom::array array, std::
 
 
   auto out = Rcpp::Vector<REALSXP>(utils::as_integer64(stl_vec_int64));
-  out.attr("dim") = Rcpp::IntegerVector::create(n_rows, n_cols);
+  out.attr("dim") = Rcpp::IntegerVector::create(static_cast<R_xlen_t>(n_rows), //
+                                                static_cast<R_xlen_t>(n_cols));
 
   return out;
 }
@@ -294,7 +296,7 @@ inline auto dispatch_mixed(const simdjson::dom::array array,
       return build_matrix_mixed<STRSXP>(array, n_cols);
 
     default: {
-      auto out = Rcpp::LogicalMatrix(std::size(array), n_cols);
+      auto out = Rcpp::LogicalMatrix(r_length(array), static_cast<R_xlen_t>(n_cols));
       out.fill(NA_LOGICAL);
       return out;
     }
diff --git a/inst/include/RcppSimdJson/deserialize/simplify.hpp b/inst/include/RcppSimdJson/deserialize/simplify.hpp
index 2614895..ebcc4b2 100644
--- a/inst/include/RcppSimdJson/deserialize/simplify.hpp
+++ b/inst/include/RcppSimdJson/deserialize/simplify.hpp
@@ -17,7 +17,7 @@ template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To si
 inline auto simplify_list(const simdjson::dom::array array,
                           const SEXP empty_array,
                           const SEXP empty_object) -> SEXP {
-  Rcpp::List out(std::size(array));
+  Rcpp::List out(r_length(array));
 
   auto i = R_xlen_t(0);
   for (auto element : array) {
@@ -136,7 +136,7 @@ template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To si
 inline auto simplify_object(const simdjson::dom::object object,
                             const SEXP empty_array,
                             const SEXP empty_object) -> SEXP {
-  const auto n = R_xlen_t(std::size(object));
+  const auto n = r_length(object);
   if (n == 0) {
     return empty_object;
   }
@@ -144,7 +144,7 @@ inline auto simplify_object(const simdjson::dom::object object,
   Rcpp::List out(n);
   Rcpp::CharacterVector out_names(n);
 
-  auto i = R_xlen_t(0);
+  auto i = R_xlen_t(0L);
   for (auto [key, value] : object) {
     out[i] =
         simplify_element<type_policy, int64_opt, simplify_to>(value, empty_array, empty_object);
diff --git a/inst/include/RcppSimdJson/deserialize/vector.hpp b/inst/include/RcppSimdJson/deserialize/vector.hpp
index e747af9..7771044 100644
--- a/inst/include/RcppSimdJson/deserialize/vector.hpp
+++ b/inst/include/RcppSimdJson/deserialize/vector.hpp
@@ -9,7 +9,7 @@ namespace vector {
 
 template <int RTYPE, typename in_T, rcpp_T R_Type, bool has_nulls>
 inline auto build_vector_typed(const simdjson::dom::array array) -> Rcpp::Vector<RTYPE> {
-  auto out = Rcpp::Vector<RTYPE>(std::size(array));
+  auto out = Rcpp::Vector<RTYPE>(r_length(array));
   auto i = R_xlen_t(0);
   for (auto element : array) {
     out[i++] = get_scalar<in_T, R_Type, has_nulls>(element);
@@ -23,7 +23,7 @@ inline auto build_vector_integer64_typed(const simdjson::dom::array array)
     -> Rcpp::Vector<REALSXP> {
 
   auto stl_vec_int64 = std::vector<int64_t>(std::size(array));
-  auto i = std::size_t(0);
+  auto i = std::size_t(0ULL);
   for (auto element : array) {
     stl_vec_int64[i++] = get_scalar<int64_t, rcpp_T::i64, has_nulls>(element);
   }
@@ -82,8 +82,8 @@ inline auto dispatch_typed(const simdjson::dom::array array,
 
 template <int RTYPE>
 inline auto build_vector_mixed(const simdjson::dom::array array) -> Rcpp::Vector<RTYPE> {
-  auto out = Rcpp::Vector<RTYPE>(std::size(array));
-  auto i = R_xlen_t(0);
+  auto out = Rcpp::Vector<RTYPE>(r_length(array));
+  auto i = R_xlen_t(0L);
   for (auto element : array) {
     out[i++] = get_scalar_dispatch<RTYPE>(element);
   }
@@ -96,7 +96,7 @@ inline auto build_vector_integer64_mixed(const simdjson::dom::array array)
     -> Rcpp::Vector<REALSXP> {
 
   auto stl_vec_int64 = std::vector<int64_t>(std::size(array));
-  auto i = std::size_t(0);
+  auto i = std::size_t(0ULL);
   for (auto element : array) {
     switch (element.type()) {
       case simdjson::dom::element_type::INT64:
@@ -151,7 +151,7 @@ inline auto dispatch_mixed(const simdjson::dom::array array, const rcpp_T common
       return build_vector_mixed<STRSXP>(array);
 
     default:
-      return Rcpp::LogicalVector(std::size(array), NA_LOGICAL);
+      return Rcpp::LogicalVector(r_length(array), NA_LOGICAL);
   }
 }
 

From 7b518d05b690e1cc38d2909179eafc006d56ce9e Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Mon, 15 Jun 2020 15:40:49 -0700
Subject: [PATCH 05/16] remove template specification for Rcpp::wrap()

---
 inst/include/RcppSimdJson/deserialize/simplify.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/inst/include/RcppSimdJson/deserialize/simplify.hpp b/inst/include/RcppSimdJson/deserialize/simplify.hpp
index ebcc4b2..8dcca0c 100644
--- a/inst/include/RcppSimdJson/deserialize/simplify.hpp
+++ b/inst/include/RcppSimdJson/deserialize/simplify.hpp
@@ -178,13 +178,13 @@ inline auto simplify_element(const simdjson::dom::element element,
       );
 
     case simdjson::dom::element_type::DOUBLE:
-      return Rcpp::wrap<double>(element.get<double>().first);
+      return Rcpp::wrap(element.get<double>().first);
 
     case simdjson::dom::element_type::INT64:
       return utils::resolve_int64<int64_opt>(element.get<int64_t>().first);
 
     case simdjson::dom::element_type::BOOL:
-      return Rcpp::wrap<bool>(element.get<bool>().first);
+      return Rcpp::wrap(element.get<bool>().first);
 
     case simdjson::dom::element_type::STRING:
       return Rcpp::wrap(element.get<const char*>().first);

From dbf9ca4bd1a5d635a3a0293e86c691d7df68a5c5 Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Mon, 15 Jun 2020 15:43:31 -0700
Subject: [PATCH 06/16] remove const qualifiers on empty_array/object

---
 .../RcppSimdJson/deserialize/simplify.hpp     | 33 ++++++++-----------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/inst/include/RcppSimdJson/deserialize/simplify.hpp b/inst/include/RcppSimdJson/deserialize/simplify.hpp
index 8dcca0c..4458e01 100644
--- a/inst/include/RcppSimdJson/deserialize/simplify.hpp
+++ b/inst/include/RcppSimdJson/deserialize/simplify.hpp
@@ -14,9 +14,8 @@ namespace deserialize {
 
 
 template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
-inline auto simplify_list(const simdjson::dom::array array,
-                          const SEXP empty_array,
-                          const SEXP empty_object) -> SEXP {
+inline auto simplify_list(const simdjson::dom::array array, SEXP empty_array, SEXP empty_object)
+    -> SEXP {
   Rcpp::List out(r_length(array));
 
   auto i = R_xlen_t(0);
@@ -33,9 +32,8 @@ inline auto simplify_list(const simdjson::dom::array array,
 
 
 template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
-inline auto simplify_vector(const simdjson::dom::array array,
-                            const SEXP empty_array,
-                            const SEXP empty_object) -> SEXP {
+inline auto simplify_vector(const simdjson::dom::array array, SEXP empty_array, SEXP empty_object)
+    -> SEXP {
   const auto type_doctor = Type_Doctor<type_policy>(array);
 
   if (type_doctor.is_vectorizable()) {
@@ -53,9 +51,8 @@ inline auto simplify_vector(const simdjson::dom::array array,
 
 
 template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
-inline auto simplify_matrix(const simdjson::dom::array array,
-                            const SEXP empty_array,
-                            const SEXP empty_object) -> SEXP {
+inline auto simplify_matrix(const simdjson::dom::array array, SEXP empty_array, SEXP empty_object)
+    -> SEXP {
   if (const auto matrix = matrix::diagnose<type_policy>(array)) {
     return matrix->is_homogeneous
                ? matrix::dispatch_typed<int64_opt>( //
@@ -73,9 +70,8 @@ inline auto simplify_matrix(const simdjson::dom::array array,
 
 
 template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
-inline auto simplify_data_frame(const simdjson::dom::array array,
-                                const SEXP empty_array,
-                                const SEXP empty_object) -> SEXP {
+inline auto
+simplify_data_frame(const simdjson::dom::array array, SEXP empty_array, SEXP empty_object) -> SEXP {
   if (const auto cols = diagnose_data_frame<type_policy>(array)) {
     return build_data_frame<type_policy, int64_opt, simplify_to>( //
         array,                                                    //
@@ -91,8 +87,8 @@ inline auto simplify_data_frame(const simdjson::dom::array array,
 
 template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
 inline auto dispatch_simplify_array(const simdjson::dom::array array,
-                                    const SEXP empty_array,
-                                    const SEXP empty_object) -> SEXP {
+                                    SEXP empty_array,
+                                    SEXP empty_object) -> SEXP {
 
   if (std::size(array) == 0) {
     return empty_array;
@@ -133,9 +129,8 @@ inline auto dispatch_simplify_array(const simdjson::dom::array array,
 
 
 template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
-inline auto simplify_object(const simdjson::dom::object object,
-                            const SEXP empty_array,
-                            const SEXP empty_object) -> SEXP {
+inline auto simplify_object(const simdjson::dom::object object, SEXP empty_array, SEXP empty_object)
+    -> SEXP {
   const auto n = r_length(object);
   if (n == 0) {
     return empty_object;
@@ -159,8 +154,8 @@ inline auto simplify_object(const simdjson::dom::object object,
 // definition: forward declaration in inst/include/RcppSimdJson/deserialize/simplify.hpp
 template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
 inline auto simplify_element(const simdjson::dom::element element,
-                             const SEXP empty_array,
-                             const SEXP empty_object) -> SEXP {
+                             SEXP empty_array,
+                             SEXP empty_object) -> SEXP {
 
   switch (element.type()) {
     case simdjson::dom::element_type::ARRAY:

From 918e4f9a78c413e9b00079eb1312bdc3d292fc1d Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Mon, 15 Jun 2020 15:46:06 -0700
Subject: [PATCH 07/16] clean dead comments, fix formatting

---
 inst/include/RcppSimdJson/deserialize.hpp     |  3 +
 .../RcppSimdJson/deserialize/Type_Doctor.hpp  | 12 +---
 .../RcppSimdJson/deserialize/dataframe.hpp    |  1 +
 .../RcppSimdJson/deserialize/matrix.hpp       | 62 +------------------
 .../RcppSimdJson/deserialize/simplify.hpp     |  1 +
 5 files changed, 11 insertions(+), 68 deletions(-)

diff --git a/inst/include/RcppSimdJson/deserialize.hpp b/inst/include/RcppSimdJson/deserialize.hpp
index edcd970..3e7b204 100644
--- a/inst/include/RcppSimdJson/deserialize.hpp
+++ b/inst/include/RcppSimdJson/deserialize.hpp
@@ -1,8 +1,10 @@
 #ifndef RCPPSIMDJSON__DESERIALIZE_HPP
 #define RCPPSIMDJSON__DESERIALIZE_HPP
 
+
 #include "deserialize/simplify.hpp"
 
+
 namespace rcppsimdjson {
 namespace deserialize {
 
@@ -248,4 +250,5 @@ inline auto deserialize(const simdjson::dom::element parsed,
 } // namespace deserialize
 } // namespace rcppsimdjson
 
+
 #endif
diff --git a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp
index c692953..f3909d5 100644
--- a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp
+++ b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp
@@ -8,13 +8,6 @@ namespace rcppsimdjson {
 namespace deserialize {
 
 
-// enum class Type_Policy : int {
-//   anything_goes = 0,
-//   ints_as_dbls = 1,
-//   strict = 2,
-// };
-
-
 template <Type_Policy type_policy> class Type_Doctor {
   bool ARRAY_ = false;
   bool array_ = false;
@@ -41,6 +34,7 @@ template <Type_Policy type_policy> class Type_Doctor {
   bool UINT64_ = false;
   bool u64_ = false;
 
+
 public:
   Type_Doctor() = default;
   explicit Type_Doctor<type_policy>(simdjson::dom::array) noexcept;
@@ -99,7 +93,7 @@ template <Type_Policy type_policy> class Type_Doctor {
 
   auto add_element(simdjson::dom::element) noexcept -> void;
 
-  constexpr auto update(Type_Doctor<type_policy>&& type_doctor) noexcept -> void;
+  constexpr auto update(Type_Doctor<type_policy>&&) noexcept -> void;
 };
 
 
@@ -327,7 +321,7 @@ template <Type_Policy type_policy>
 inline constexpr auto Type_Doctor<type_policy>::common_element_type() const noexcept
     -> simdjson::dom::element_type {
 
-  using namespace simdjson::dom;
+  using simdjson::dom::element_type;
 
   return ARRAY_ ? element_type::ARRAY
                 : OBJECT_ ? element_type::OBJECT
diff --git a/inst/include/RcppSimdJson/deserialize/dataframe.hpp b/inst/include/RcppSimdJson/deserialize/dataframe.hpp
index 9f897e5..0efa7c4 100644
--- a/inst/include/RcppSimdJson/deserialize/dataframe.hpp
+++ b/inst/include/RcppSimdJson/deserialize/dataframe.hpp
@@ -1,6 +1,7 @@
 #ifndef RCPPSIMDJSON__DESERIALIZE__DATAFRAME_HPP
 #define RCPPSIMDJSON__DESERIALIZE__DATAFRAME_HPP
 
+
 #include "matrix.hpp"
 
 
diff --git a/inst/include/RcppSimdJson/deserialize/matrix.hpp b/inst/include/RcppSimdJson/deserialize/matrix.hpp
index a08995c..edd173f 100644
--- a/inst/include/RcppSimdJson/deserialize/matrix.hpp
+++ b/inst/include/RcppSimdJson/deserialize/matrix.hpp
@@ -54,16 +54,6 @@ inline auto build_matrix_typed(simdjson::dom::array array, std::size_t n_cols)
   auto out = Rcpp::Matrix<RTYPE>(n_rows, static_cast<R_xlen_t>(n_cols));
   auto j = R_xlen_t(0);
 
-#if RCPPSIMDJSON_EXCEPTIONS
-  for (simdjson::dom::array sub_array : array) {
-    auto i = R_xlen_t(0);
-    for (auto element : sub_array) {
-      out[i + j] = get_scalar<in_T, R_Type, has_nulls>(element);
-      i += n_rows;
-    }
-    j++;
-  }
-#else
   for (auto sub_array : array) {
     auto i = R_xlen_t(0);
     for (auto element : sub_array.get<simdjson::dom::array>().first) {
@@ -72,29 +62,18 @@ inline auto build_matrix_typed(simdjson::dom::array array, std::size_t n_cols)
     }
     j++;
   }
-#endif
 
   return out;
 }
 
 template <bool has_nulls>
-inline auto build_matrix_integer64_typed(simdjson::dom::array array, const std::size_t n_cols)
+inline auto build_matrix_integer64_typed(simdjson::dom::array array, std::size_t n_cols)
     -> Rcpp::Vector<REALSXP> {
 
   const auto n_rows = std::size(array);
   auto stl_vec_int64 = std::vector<int64_t>(n_rows * n_cols);
   auto j = std::size_t(0);
 
-#if RCPPSIMDJSON_EXCEPTIONS
-  for (simdjson::dom::array sub_array : array) {
-    auto i = std::size_t(0);
-    for (auto element : sub_array) {
-      stl_vec_int64[i + j] = get_scalar<int64_t, rcpp_T::i64, has_nulls>(element);
-      i += n_rows;
-    }
-    j++;
-  }
-#else
   for (auto sub_array : array) {
     auto i = std::size_t(0);
     for (auto element : sub_array.get<simdjson::dom::array>().first) {
@@ -103,7 +82,6 @@ inline auto build_matrix_integer64_typed(simdjson::dom::array array, const std::
     }
     j++;
   }
-#endif
 
   auto out = Rcpp::NumericVector(utils::as_integer64(stl_vec_int64));
   out.attr("dim") = Rcpp::IntegerVector::create(static_cast<R_xlen_t>(n_rows), //
@@ -169,26 +147,16 @@ inline auto dispatch_typed(const simdjson::dom::array array,
     default:
       return R_NilValue;
   }
-} // namespace deserialize
+}
 
 template <int RTYPE>
-inline auto build_matrix_mixed(const simdjson::dom::array array, const std::size_t n_cols) -> SEXP {
+inline auto build_matrix_mixed(simdjson::dom::array array, std::size_t n_cols) -> SEXP {
 
   const auto n_rows = r_length(array);
   Rcpp::Matrix<RTYPE> out(n_rows, static_cast<R_xlen_t>(n_cols));
 
   auto j = R_xlen_t(0);
 
-#if RCPPSIMDJSON_EXCEPTIONS
-  for (simdjson::dom::array sub_array : array) {
-    auto i = R_xlen_t(0);
-    for (auto element : sub_array) {
-      out[i + j] = get_scalar_dispatch<RTYPE>(element);
-      i += n_rows;
-    }
-    j++;
-  }
-#else
   for (auto sub_array : array) {
     auto i = R_xlen_t(0);
     for (auto element : sub_array.get<simdjson::dom::array>().first) {
@@ -197,7 +165,6 @@ inline auto build_matrix_mixed(const simdjson::dom::array array, const std::size
     }
     j++;
   }
-#endif
 
   return out;
 }
@@ -210,27 +177,6 @@ inline auto build_matrix_integer64_mixed(const simdjson::dom::array array, std::
   auto stl_vec_int64 = std::vector<int64_t>(n_rows * n_cols);
   auto j = std::size_t(0ULL);
 
-#if RCPPSIMDJSON_EXCEPTIONS
-  for (simdjson::dom::array sub_array : array) {
-    std::size_t i = 0;
-    for (auto element : sub_array) {
-      switch (element.type()) {
-        case simdjson::dom::element_type::INT64:
-          stl_vec_int64[i + j] = get_scalar<int64_t, rcpp_T::i64, NO_NULLS>(element);
-          break;
-
-        case simdjson::dom::element_type::BOOL:
-          stl_vec_int64[i + j] = get_scalar<bool, rcpp_T::i64, NO_NULLS>(element);
-          break;
-
-        default:
-          stl_vec_int64[i + j] = NA_INTEGER64;
-      }
-      i += n_rows;
-    }
-    j++;
-  }
-#else
   for (auto element : array) {
     std::size_t i = 0;
     for (auto sub_element : element.get<simdjson::dom::array>().first) {
@@ -250,8 +196,6 @@ inline auto build_matrix_integer64_mixed(const simdjson::dom::array array, std::
     }
     j++;
   }
-#endif
-
 
   auto out = Rcpp::Vector<REALSXP>(utils::as_integer64(stl_vec_int64));
   out.attr("dim") = Rcpp::IntegerVector::create(static_cast<R_xlen_t>(n_rows), //
diff --git a/inst/include/RcppSimdJson/deserialize/simplify.hpp b/inst/include/RcppSimdJson/deserialize/simplify.hpp
index 4458e01..e69eb23 100644
--- a/inst/include/RcppSimdJson/deserialize/simplify.hpp
+++ b/inst/include/RcppSimdJson/deserialize/simplify.hpp
@@ -1,6 +1,7 @@
 #ifndef RCPPSIMDJSON__DESERIALIZE__SIMPLIFY_HPP
 #define RCPPSIMDJSON__DESERIALIZE__SIMPLIFY_HPP
 
+
 #include "../common.hpp"
 #include "Type_Doctor.hpp"
 #include "scalar.hpp"

From 346d5840aeb06cc2e655984f3c0a1867520300a6 Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Mon, 15 Jun 2020 15:46:36 -0700
Subject: [PATCH 08/16] add more documentation

---
 inst/include/RcppSimdJson/common.hpp          | 98 +++++++++++--------
 inst/include/RcppSimdJson/deserialize.hpp     | 27 ++++-
 .../RcppSimdJson/deserialize/scalar.hpp       | 10 ++
 .../RcppSimdJson/deserialize/simplify.hpp     | 25 ++++-
 4 files changed, 115 insertions(+), 45 deletions(-)

diff --git a/inst/include/RcppSimdJson/common.hpp b/inst/include/RcppSimdJson/common.hpp
index 71a9d86..a882d7b 100644
--- a/inst/include/RcppSimdJson/common.hpp
+++ b/inst/include/RcppSimdJson/common.hpp
@@ -8,30 +8,39 @@
 
 namespace rcppsimdjson {
 
-/*
- * `bit64::integer64`-compatible `NA`
+/**
+ * @brief A container's size as an @c R_xlen_t @c. Otherwise equivalent to @c std::size() @c.
+ */
+template <typename _Container>
+inline constexpr auto r_length(const _Container& __cont) noexcept -> R_xlen_t {
+  return static_cast<R_xlen_t>(std::size(__cont));
+}
+
+
+/**
+ * @brief A @c bit64::integer64 @c-compatible @c NA @c.
  */
 static inline constexpr int64_t NA_INTEGER64 = LLONG_MIN;
 
 
-/*
- * Typing arguments that decide how `simdjson::dom::element`s are ultimate return to R.
+/**
+ * @brief Typing arguments that decide how a @c simdjson::dom::element is ultimately returned to R.
  */
 enum class rcpp_T : int {
-  array = 0,  /* recursive: individual elements will decide ultimate R type */
-  object = 1, /* recursive: individual elements will decide ultimate R type */
-  chr = 2,    /* always becomes `Rcpp::String`/`character(1L)` */
-  u64 = 3,    /* always becomes `Rcpp::String`/`character(1L)` */
-  dbl = 4,    /* always becomes `double` */
-  i64 = 5,    /* follows Int64_R_Type: `double`, `character(1L)`, or `bit64::integer64` */
-  i32 = 6,    /* always becomes `int` */
-  lgl = 7,    /* always becomes `bool */
-  null = 8,   /* becomes `NA` if returned in a vector, else `NULL */
+  array = 0,  /**< recursive: individual elements will decide ultimate R type */
+  object = 1, /**< recursive: individual elements will decide ultimate R type */
+  chr = 2,    /**< always becomes @c Rcpp::String / @c character */
+  u64 = 3,    /**< always becomes @c Rcpp::String / @c character */
+  dbl = 4,    /**< always becomes @c double */
+  i64 = 5,    /**< follows @c Int64_R_Type: @c double, @c character, or @c bit64::integer64 */
+  i32 = 6,    /**< always becomes @c int */
+  lgl = 7,    /**< always becomes @c bool / @c logical */
+  null = 8,   /**< becomes @c NA if returned in a vector, else @c NULL */
 };
 
 
-/*
- * Generic, typed `NA` inserter.
+/**
+ * @brief Get a typed @c NA @c.
  */
 template <rcpp_T R_Type> static inline constexpr auto na_val() {
   if constexpr (R_Type == rcpp_T::chr) {
@@ -52,28 +61,34 @@ template <rcpp_T R_Type> static inline constexpr auto na_val() {
 }
 
 
-/*
+/**
  * Internal flags tracking whether simdjson is compiled with exceptions enabled (the default).
- * If simdjson is compiled w/o exceptions (`#define SIMDJSON_EXCEPTIONS 0`), operations that do not
- * touch throwing code can be annotated with keyword `noexcept` where appropriate.
- * See inst/include/RcppSimdJson/deserialize/scalar.hpp for examples.
+ * If simdjson is compiled w/o exceptions ( @c #define SIMDJSON_EXCEPTIONS 0 @c), operations that
+ * do not touch throwing code can be annotated with keyword @c noexcept where appropriate.
  */
 // #define SIMDJSON_EXCEPTIONS 0 /* uncomment to disable compiling simdjson w/ exceptions */
 #ifdef SIMDJSON_EXCEPTIONS
 #define RCPPSIMDJSON_EXCEPTIONS SIMDJSON_EXCEPTIONS
 static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = SIMDJSON_EXCEPTIONS != 1;
 #else
-#define RCPPSIMDJSON_EXCEPTIONS 1 // NOLINT(cppcoreguidelines-macro-usage)
+#define RCPPSIMDJSON_EXCEPTIONS 1
 static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = false;
 #endif
 
 
-/*
- * All scalar-getter functions are annotated with `is_no_except()`, which will be false if
- * `RCPPSIMDJSON_NO_EXCEPTIONS` is enabled and their `rcpp_T` template argument is not
- * `rcpp_T::chr` (strings are not currently extractable w/o touching throwing code).
+/**
+ * @brief Whether a function is @code{noexcept}.
+ *
+ * If a function does not touch throwing code it can be annotated as @c noexcept().
+ * If @c RCPPSIMDJSON_NO_EXCEPTIONS is enabled and the @c rcpp_T template argument is not
+ * @c rcpp_T::chr, functions annotated with @c noexcept(is_no_except(rcpp_T)) will be @c noexcept
+ * when compiled. Currently, @c rccp_T::chr touches throwing code so functions using it will always
+ * be @c noexcept(false).
+ *
+ * Many examples in @file{inst/include/RcppSimdJson/deserialize/scalar.hpp}.
  */
-static inline constexpr auto is_no_except(rcpp_T R_Type) noexcept -> bool {
+static inline constexpr auto is_no_except(rcpp_T R_Type) // NOLINT(clang-diagnostic-unused-function)
+    -> bool {
   return RCPPSIMDJSON_NO_EXCEPTIONS && R_Type != rcpp_T::chr;
 }
 
@@ -81,30 +96,27 @@ static inline constexpr auto is_no_except(rcpp_T R_Type) noexcept -> bool {
 namespace deserialize {
 
 
-static inline constexpr bool HAS_NULLS = true;
-static inline constexpr bool NO_NULLS = false;
-
-/*
- * Determines level of type strictness in combining array elements into R vectors.
+/**
+ * @brief Determines level of type strictness in combining array elements into R vectors.
  *
- * When arrays are not homogeneous and `Type_Policy::anything_goes` is used, type promotion follows
+ * When arrays are not homogeneous and @c Type_Policy::anything_goes is used, type promotion follows
  * R's behavior.
  */
 enum class Type_Policy : int {
-  anything_goes = 0, /* Non-recursive arrays always become vectors */
-  ints_as_dbls = 1,  /* Combines `rcpp_T::i32`s, `::i64`s, and `::dbl`s */
-  strict = 2,        /* No type promotions */
+  anything_goes = 0, /* Non-recursive arrays always become vectors of the highest present type */
+  ints_as_dbls = 1,  /*  Non-recursive arrays of only numbers are promoted to highest type */
+  strict = 2,        /* No type promotion */
 };
 
 
-/*
- * Maximum simplification level. `Simplify_To::list` results in no simplification.
+/**
+ * @brief Maximum simplification level.
  */
 enum class Simplify_To : int {
-  data_frame = 0,
-  matrix = 1,
-  vector = 2,
-  list = 3,
+  data_frame = 0, /* If possible, return dataframes. Otherwise return matrices/vectors/lists. */
+  matrix = 1,     /* If possible, return matrices. Otherwise return vectors/lists. */
+  vector = 2,     /* If possible, return vectors. Otherwise return lists. */
+  list = 3,       /* No simplification. */
 };
 
 
@@ -122,7 +134,11 @@ namespace rcppsimdjson {
 namespace deserialize {
 
 
-// forward declaration: definition in inst/include/RcppSimdJson/deserialize/simplify.hpp
+/**
+ * @brief Simplify a @c simdjson::dom::element to an R object.
+ *
+ * @note Forward declaration. See @file inst/include/RcppSimdJson/deserialize/simplify.hpp @file.
+ */
 template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
 inline auto simplify_element(simdjson::dom::element element, SEXP empty_array, SEXP empty_object)
     -> SEXP;
diff --git a/inst/include/RcppSimdJson/deserialize.hpp b/inst/include/RcppSimdJson/deserialize.hpp
index 3e7b204..c990bf9 100644
--- a/inst/include/RcppSimdJson/deserialize.hpp
+++ b/inst/include/RcppSimdJson/deserialize.hpp
@@ -8,15 +8,36 @@
 namespace rcppsimdjson {
 namespace deserialize {
 
-// THE GREAT DISPATCHER
+
+/**
+ * @brief Deserialize a parsed @c simdjson::dom::element to R objects.
+ *
+ *
+ * @param element @c simdjson::dom::element to deserialize.
+ *
+ * @param empty_array R object to return when encountering an empty JSON array.
+ *
+ * @param empty_object R object to return when encountering an empty JSON object.
+ *
+ * @param type_policy @c Type_Policy specifying type strictness in combining mixed-type array
+ * elements into R vectors.
+ *
+ * @param int64_opt @c Int64_R_Type specifying how big integers are returned to R.
+ *
+ * @param simplify_to @c Simplify_To specifying the maximum level of simplification.
+ *
+ *
+ * @return The simplified R object ( @c SEXP ).
+ */
 inline auto deserialize(const simdjson::dom::element parsed,
-                        const SEXP empty_array,
-                        const SEXP empty_object,
+                        SEXP empty_array,
+                        SEXP empty_object,
                         const Simplify_To simplify_to,
                         const Type_Policy type_policy,
                         const utils::Int64_R_Type int64_opt) -> SEXP {
   using Int64_R_Type = utils::Int64_R_Type;
 
+  // THE GREAT DISPATCHER
   switch (type_policy) {
     case Type_Policy::anything_goes: {
       switch (int64_opt) {
diff --git a/inst/include/RcppSimdJson/deserialize/scalar.hpp b/inst/include/RcppSimdJson/deserialize/scalar.hpp
index 78acfdf..d76c04e 100644
--- a/inst/include/RcppSimdJson/deserialize/scalar.hpp
+++ b/inst/include/RcppSimdJson/deserialize/scalar.hpp
@@ -7,6 +7,16 @@ namespace rcppsimdjson {
 namespace deserialize {
 
 
+/*
+ * Check for `null`s and return the appropriate `NA`s when found.
+ */
+static inline constexpr bool HAS_NULLS = true;
+/*
+ * No `null`s present, so skip checking for them.
+ */
+static inline constexpr bool NO_NULLS = false;
+
+
 template <typename in_T, rcpp_T R_Type>
 inline auto get_scalar_(simdjson::dom::element) noexcept(is_no_except(R_Type));
 
diff --git a/inst/include/RcppSimdJson/deserialize/simplify.hpp b/inst/include/RcppSimdJson/deserialize/simplify.hpp
index e69eb23..3107c64 100644
--- a/inst/include/RcppSimdJson/deserialize/simplify.hpp
+++ b/inst/include/RcppSimdJson/deserialize/simplify.hpp
@@ -152,7 +152,30 @@ inline auto simplify_object(const simdjson::dom::object object, SEXP empty_array
 }
 
 
-// definition: forward declaration in inst/include/RcppSimdJson/deserialize/simplify.hpp
+/**
+ * @brief Simplify a @c simdjson::dom::element to an R object.
+ *
+ *
+ * @tparam type_policy The @c Type_Policy specifying type strictness in combining mixed-type array
+ * elements into R vectors.
+ *
+ * @tparam int64_opt The @c Int64_R_Type specifying how big integers are returned to R.
+ *
+ * @tparam simplify_to The @c Simplify_To specifying the maximum level of simplification.
+ *
+ *
+ * @param element @c simdjson::dom::element to simplify.
+ *
+ * @param empty_array R object to return when encountering an empty JSON array.
+ *
+ * @param empty_object R object to return when encountering an empty JSON object.
+ *
+ *
+ * @return The simplified R object ( @c SEXP ).
+ *
+ *
+ * @note definition: forward declaration in @file inst/include/RcppSimdJson/common.hpp @file.
+ */
 template <Type_Policy type_policy, utils::Int64_R_Type int64_opt, Simplify_To simplify_to>
 inline auto simplify_element(const simdjson::dom::element element,
                              SEXP empty_array,

From 52a0f72584f3a69ba6f296f4031236046190e2b9 Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Mon, 15 Jun 2020 18:43:06 -0700
Subject: [PATCH 09/16] small documentation fixes

---
 inst/include/RcppSimdJson/common.hpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/inst/include/RcppSimdJson/common.hpp b/inst/include/RcppSimdJson/common.hpp
index a882d7b..1261d36 100644
--- a/inst/include/RcppSimdJson/common.hpp
+++ b/inst/include/RcppSimdJson/common.hpp
@@ -77,13 +77,15 @@ static inline constexpr auto RCPPSIMDJSON_NO_EXCEPTIONS = false;
 
 
 /**
- * @brief Whether a function is @code{noexcept}.
+ * @brief Whether a function is @c noexcept.
  *
- * If a function does not touch throwing code it can be annotated as @c noexcept().
+ * If a function does not touch throwing code it can be annotated with @c noexcept().
  * If @c RCPPSIMDJSON_NO_EXCEPTIONS is enabled and the @c rcpp_T template argument is not
  * @c rcpp_T::chr, functions annotated with @c noexcept(is_no_except(rcpp_T)) will be @c noexcept
- * when compiled. Currently, @c rccp_T::chr touches throwing code so functions using it will always
- * be @c noexcept(false).
+ * when compiled.
+ *
+ * Currently, @c rcpp_T::chr touches throwing code so functions using it will always be
+ * @c noexcept(false).
  *
  * Many examples in @file{inst/include/RcppSimdJson/deserialize/scalar.hpp}.
  */
@@ -104,7 +106,7 @@ namespace deserialize {
  */
 enum class Type_Policy : int {
   anything_goes = 0, /* Non-recursive arrays always become vectors of the highest present type */
-  ints_as_dbls = 1,  /*  Non-recursive arrays of only numbers are promoted to highest type */
+  ints_as_dbls = 1,  /* Non-recursive arrays of only numbers are promoted to highest type */
   strict = 2,        /* No type promotion */
 };
 
@@ -121,8 +123,6 @@ enum class Simplify_To : int {
 
 
 } // namespace deserialize
-
-
 } // namespace rcppsimdjson
 
 

From f4d73e2daaac7c8c272b1ad8babc547c03b71bd0 Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Mon, 15 Jun 2020 18:45:48 -0700
Subject: [PATCH 10/16] fix .deserialize_json() when exceptions are disabled;
 add .load_json() file reader

---
 src/deserialize.cpp | 39 ++++++++++++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/src/deserialize.cpp b/src/deserialize.cpp
index 6d81721..8d10fe0 100644
--- a/src/deserialize.cpp
+++ b/src/deserialize.cpp
@@ -1,5 +1,6 @@
 #include <RcppSimdJson.hpp>
 
+
 //' Deserialize JSON into R Objects
 //'
 //' @param json \code{character(1L)}
@@ -33,18 +34,42 @@ SEXP deserialize_json(const Rcpp::String& json,
 
   simdjson::dom::parser parser;
 
-#if RCPPSIMDJSON_EXCEPTIONS
-  simdjson::dom::element parsed = json_pointer.empty() //
-                                      ? parser.parse(json)
-                                      : parser.parse(json).at(json_pointer);
-#else
   auto [parsed, error] = json_pointer.empty() //
-                             ? parser.parse(json).first
+                             ? parser.parse(json)
                              : parser.parse(json).at(json_pointer);
+
+  if (error) {
+    Rcpp::stop(simdjson::error_message(error));
+  }
+
+  return deserialize::deserialize(parsed,
+                                  empty_array,
+                                  empty_object,
+                                  static_cast<deserialize::Simplify_To>(simplify_to),
+                                  static_cast<deserialize::Type_Policy>(type_policy),
+                                  static_cast<utils::Int64_R_Type>(int64_r_type));
+}
+
+
+// [[Rcpp::export(.load_json)]]
+SEXP load_json(const std::string& file_path,
+               const std::string& json_pointer = "",
+               SEXP empty_array = R_NilValue,
+               SEXP empty_object = R_NilValue,
+               const int simplify_to = 0,
+               const int type_policy = 0,
+               const int int64_r_type = 0) {
+  using namespace rcppsimdjson;
+
+  simdjson::dom::parser parser;
+
+  auto [parsed, error] = json_pointer.empty() //
+                             ? parser.load(file_path)
+                             : parser.load(file_path).at(json_pointer);
+
   if (error) {
     Rcpp::stop(simdjson::error_message(error));
   }
-#endif
 
   return deserialize::deserialize(parsed,
                                   empty_array,

From 50982cb1314a1d80f4d6dda17fbf77e42ff559af Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Mon, 15 Jun 2020 18:48:04 -0700
Subject: [PATCH 11/16] sync with upstream simdjson (12 Jun 2020)

---
 inst/include/simdjson.cpp | 5855 ++++++++++++++++++++-----------------
 inst/include/simdjson.h   |  829 +++---
 2 files changed, 3581 insertions(+), 3103 deletions(-)

diff --git a/inst/include/simdjson.cpp b/inst/include/simdjson.cpp
index a2d815f..d99dc8b 100644
--- a/inst/include/simdjson.cpp
+++ b/inst/include/simdjson.cpp
@@ -1,4 +1,4 @@
-/* auto-generated on Wed May 20 10:23:07 EDT 2020. Do not edit! */
+/* auto-generated on Fri 12 Jun 2020 13:09:36 EDT. Do not edit! */
 /* begin file src/simdjson.cpp */
 #include "simdjson.h"
 
@@ -12,7 +12,6 @@ namespace internal {
 
   SIMDJSON_DLLIMPORTEXPORT const error_code_info error_codes[] {
     { SUCCESS, "No error" },
-    { SUCCESS_AND_HAS_MORE, "No error and buffer still has more data" },
     { CAPACITY, "This parser can't support a document that big" },
     { MEMALLOC, "Error allocating memory, we're most likely out of memory" },
     { TAPE_ERROR, "The JSON document has an improper structure: missing or superfluous commas, braces, missing keys, etc." },
@@ -359,8 +358,6 @@ static const uint64_t thintable_epi8[256] = {
 namespace simdjson {
 namespace haswell {
 
-using namespace simdjson::dom;
-
 class implementation final : public simdjson::implementation {
 public:
   really_inline implementation() : simdjson::implementation(
@@ -368,11 +365,12 @@ class implementation final : public simdjson::implementation {
       "Intel/AMD AVX2",
       instruction_set::AVX2 | instruction_set::PCLMULQDQ | instruction_set::BMI1 | instruction_set::BMI2
   ) {}
-  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
+  WARN_UNUSED error_code create_dom_parser_implementation(
+    size_t capacity,
+    size_t max_length,
+    std::unique_ptr<internal::dom_parser_implementation>& dst
+  ) const noexcept final;
   WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
-  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final;
-  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
-  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final;
 };
 
 } // namespace haswell
@@ -398,11 +396,12 @@ using namespace simdjson::dom;
 class implementation final : public simdjson::implementation {
 public:
   really_inline implementation() : simdjson::implementation("westmere", "Intel/AMD SSE4.2", instruction_set::SSE42 | instruction_set::PCLMULQDQ) {}
-  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
+  WARN_UNUSED error_code create_dom_parser_implementation(
+    size_t capacity,
+    size_t max_length,
+    std::unique_ptr<internal::dom_parser_implementation>& dst
+  ) const noexcept final;
   WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
-  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final;
-  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
-  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final;
 };
 
 } // namespace westmere
@@ -428,11 +427,12 @@ using namespace simdjson::dom;
 class implementation final : public simdjson::implementation {
 public:
   really_inline implementation() : simdjson::implementation("arm64", "ARM NEON", instruction_set::NEON) {}
-  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
+  WARN_UNUSED error_code create_dom_parser_implementation(
+    size_t capacity,
+    size_t max_length,
+    std::unique_ptr<internal::dom_parser_implementation>& dst
+  ) const noexcept final;
   WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
-  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final;
-  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
-  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final;
 };
 
 } // namespace arm64
@@ -462,11 +462,12 @@ class implementation final : public simdjson::implementation {
       "Generic fallback implementation",
       0
   ) {}
-  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
+  WARN_UNUSED error_code create_dom_parser_implementation(
+    size_t capacity,
+    size_t max_length,
+    std::unique_ptr<internal::dom_parser_implementation>& dst
+  ) const noexcept final;
   WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
-  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final;
-  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
-  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final;
 };
 
 } // namespace fallback
@@ -489,21 +490,16 @@ class detect_best_supported_implementation_on_first_use final : public implement
   const std::string &name() const noexcept final { return set_best()->name(); }
   const std::string &description() const noexcept final { return set_best()->description(); }
   uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); }
-  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept final {
-    return set_best()->parse(buf, len, parser);
+  WARN_UNUSED error_code create_dom_parser_implementation(
+    size_t capacity,
+    size_t max_length,
+    std::unique_ptr<internal::dom_parser_implementation>& dst
+  ) const noexcept final {
+    return set_best()->create_dom_parser_implementation(capacity, max_length, dst);
   }
   WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final {
     return set_best()->minify(buf, len, dst, dst_len);
   }
-  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, dom::parser &parser, bool streaming) const noexcept final {
-    return set_best()->stage1(buf, len, parser, streaming);
-  }
-  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept final {
-    return set_best()->stage2(buf, len, parser);
-  }
-  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser, size_t &next_json) const noexcept final {
-    return set_best()->stage2(buf, len, parser, next_json);
-  }
 
   really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
 private:
@@ -532,21 +528,16 @@ const std::initializer_list<const implementation *> available_implementation_poi
 // So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support
 class unsupported_implementation final : public implementation {
 public:
-  WARN_UNUSED error_code parse(const uint8_t *, size_t, dom::parser &) const noexcept final {
+  WARN_UNUSED error_code create_dom_parser_implementation(
+    size_t,
+    size_t,
+    std::unique_ptr<internal::dom_parser_implementation>&
+  ) const noexcept final {
     return UNSUPPORTED_ARCHITECTURE;
   }
   WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final {
     return UNSUPPORTED_ARCHITECTURE;
   }
-  WARN_UNUSED error_code stage1(const uint8_t *, size_t, dom::parser &, bool) const noexcept final {
-    return UNSUPPORTED_ARCHITECTURE;
-  }
-  WARN_UNUSED error_code stage2(const uint8_t *, size_t, dom::parser &) const noexcept final {
-    return UNSUPPORTED_ARCHITECTURE;
-  }
-  WARN_UNUSED error_code stage2(const uint8_t *, size_t, dom::parser &, size_t &) const noexcept final {
-    return UNSUPPORTED_ARCHITECTURE;
-  }
 
   unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
 };
@@ -1942,7 +1933,151 @@ const uint64_t mantissa_128[] = {
 /* simdprune_tables.h already included: #include "simdprune_tables.h" */
 
 #if SIMDJSON_IMPLEMENTATION_ARM64
-/* begin file src/arm64/stage1.cpp */
+/* begin file src/arm64/implementation.cpp */
+/* arm64/implementation.h already included: #include "arm64/implementation.h" */
+/* begin file src/arm64/dom_parser_implementation.h */
+#ifndef SIMDJSON_ARM64_DOM_PARSER_IMPLEMENTATION_H
+#define SIMDJSON_ARM64_DOM_PARSER_IMPLEMENTATION_H
+
+/* isadetection.h already included: #include "isadetection.h" */
+
+namespace simdjson {
+namespace arm64 {
+
+/* begin file src/generic/dom_parser_implementation.h */
+// Expectation: sizeof(scope_descriptor) == 64/8, i.e. 8 bytes (two uint32_t fields).
+struct scope_descriptor {
+  uint32_t tape_index; // where, on the tape, the scope ([ or {) begins
+  uint32_t count; // how many elements are in the scope
+}; // struct scope_descriptor
+
+#ifdef SIMDJSON_USE_COMPUTED_GOTO
+typedef void* ret_address_t;
+#else
+typedef char ret_address_t;
+#endif
+
+class dom_parser_implementation final : public internal::dom_parser_implementation {
+public:
+  /** Tape location of each open { or [ (array is allocated by set_max_depth, one entry per depth level) */
+  std::unique_ptr<scope_descriptor[]> containing_scope{};
+  /** Return address of each open { or [ (array is allocated by set_max_depth, one entry per depth level) */
+  std::unique_ptr<ret_address_t[]> ret_address{};
+  /** Buffer passed to stage 1 */
+  const uint8_t *buf{};
+  /** Length passed to stage 1 */
+  size_t len{0};
+  /** Document passed to stage 2 */
+  dom::document *doc{};
+  /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */
+  error_code error{UNINITIALIZED};
+
+  really_inline dom_parser_implementation();
+  dom_parser_implementation(const dom_parser_implementation &) = delete; // not copy-constructible
+  dom_parser_implementation & operator=(const dom_parser_implementation &) = delete; // not copy-assignable
+
+  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
+  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final;
+  WARN_UNUSED error_code check_for_unclosed_array() noexcept;
+  WARN_UNUSED error_code stage2(dom::document &doc) noexcept final;
+  WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final;
+  WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final; // (re)allocates structural_indexes, records _capacity
+  WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final; // (re)allocates containing_scope and ret_address, records _max_depth
+};
+
+/* begin file src/generic/stage1/allocate.h */
+namespace stage1 {
+namespace allocate {
+
+// Allocates stage 1 internal state and outputs in the parser: replaces parser.structural_indexes
+// with a new array of ROUNDUP_N(capacity, 64) + 2 + 7 entries, zeroes entry 0 and resets the
+// structural count. Returns MEMALLOC if the allocation fails, SUCCESS otherwise.
+really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) {
+  size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; // NOTE(review): the "+ 2 + 7" slack is not explained here; confirm against stage 1
+  parser.structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); // nothrow: failure yields nullptr instead of throwing
+  if (!parser.structural_indexes) { return MEMALLOC; }
+  parser.structural_indexes[0] = 0;
+  parser.n_structural_indexes = 0;
+  return SUCCESS;
+}
+
+} // namespace allocate
+} // namespace stage1
+/* end file src/generic/stage1/allocate.h */
+/* begin file src/generic/stage2/allocate.h */
+namespace stage2 {
+namespace allocate {
+
+// Allocates stage 2 internal state and outputs in the parser: max_depth scope_descriptor entries
+// and max_depth ret_address_t entries. Both arrays are replaced before either result is checked.
+// Returns MEMALLOC if either allocation fails, SUCCESS otherwise.
+really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) {
+  parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]); // nothrow: failure yields nullptr instead of throwing
+  parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]);
+
+  if (!parser.ret_address || !parser.containing_scope) {
+    return MEMALLOC;
+  }
+  return SUCCESS;
+}
+
+} // namespace allocate
+} // namespace stage2
+/* end file src/generic/stage2/allocate.h */
+
+really_inline dom_parser_implementation::dom_parser_implementation() {} // empty body: the members declared in this class use their in-class initializers
+
+// Leaving these here so they can be inlined if so desired
+WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
+  error_code err = stage1::allocate::set_capacity(*this, capacity); // (re)allocates structural_indexes
+  if (err) { _capacity = 0; return err; } // allocation failed: record a capacity of zero and report the error
+  _capacity = capacity; // record the capacity that was just allocated
+  return SUCCESS;
+}
+
+WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept {
+  error_code err = stage2::allocate::set_max_depth(*this, max_depth); // (re)allocates containing_scope and ret_address
+  if (err) { _max_depth = 0; return err; } // allocation failed: record a depth of zero and report the error
+  _max_depth = max_depth; // record the depth that was just allocated
+  return SUCCESS;
+}
+/* end file src/generic/dom_parser_implementation.h */
+
+} // namespace arm64
+} // namespace simdjson
+
+#endif // SIMDJSON_ARM64_DOM_PARSER_IMPLEMENTATION_H
+/* end file src/arm64/dom_parser_implementation.h */
+
+TARGET_HASWELL
+
+namespace simdjson {
+namespace arm64 {
+
+WARN_UNUSED error_code implementation::create_dom_parser_implementation(
+  size_t capacity,  // passed to dst->set_capacity()
+  size_t max_depth, // passed to dst->set_max_depth()
+  std::unique_ptr<internal::dom_parser_implementation>& dst // receives the new arm64 parser (left set even when an error is returned)
+) const noexcept {
+  dst.reset( new (std::nothrow) dom_parser_implementation() );
+  if (!dst) { return MEMALLOC; }
+  if (error_code err = dst->set_capacity(capacity)) { return err; }   // propagate allocation failures rather than
+  if (error_code err = dst->set_max_depth(max_depth)) { return err; } // reporting SUCCESS for a parser that has no buffers
+  return SUCCESS;
+}
+
+} // namespace arm64
+} // namespace simdjson
+
+UNTARGET_REGION
+/* end file src/arm64/implementation.cpp */
+/* begin file src/arm64/dom_parser_implementation.cpp */
+/* arm64/implementation.h already included: #include "arm64/implementation.h" */
+/* arm64/dom_parser_implementation.h already included: #include "arm64/dom_parser_implementation.h" */
+
+//
+// Stage 1
+//
 /* begin file src/arm64/bitmask.h */
 #ifndef SIMDJSON_ARM64_BITMASK_H
 #define SIMDJSON_ARM64_BITMASK_H
@@ -2594,7 +2729,6 @@ really_inline int8x16_t make_int8x16_t(int8_t x1,  int8_t x2,  int8_t x3,  int8_
 #endif // SIMDJSON_ARM64_SIMD_H
 /* end file src/arm64/bitmanipulation.h */
 /* arm64/bitmanipulation.h already included: #include "arm64/bitmanipulation.h" */
-/* arm64/implementation.h already included: #include "arm64/implementation.h" */
 
 namespace simdjson {
 namespace arm64 {
@@ -2665,24 +2799,21 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-  really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
-  really_inline size_t block_index() { return idx; }
-  really_inline bool has_full_block() const {
-    return idx < lenminusstep;
-  }
-  really_inline const uint8_t *full_block() const {
-    return &buf[idx];
-  }
-  really_inline bool has_remainder() const {
-    return idx < len;
-  }
-  really_inline void get_remainder(uint8_t *tmp_buf) const {
-    memset(tmp_buf, 0x20, STEP_SIZE);
-    memcpy(tmp_buf, buf + idx, len - idx);
-  }
-  really_inline void advance() {
-    idx += STEP_SIZE;
-  }
+  really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
+  really_inline size_t block_index();
+  really_inline bool has_full_block() const;
+  really_inline const uint8_t *full_block() const;
+  /**
+   * Get the last block, padded with spaces.
+   *
+   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
+   * function fills the buffer with spaces and returns 0). In particular, if len == STEP_SIZE there
+   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+   *
+   * @return the number of effective characters in the last block.
+   */
+  really_inline size_t get_remainder(uint8_t *dst) const;
+  really_inline void advance();
 private:
   const uint8_t *buf;
   const size_t len;
@@ -2690,6 +2821,18 @@ struct buf_block_reader {
   size_t idx;
 };
 
+constexpr const int TITLE_SIZE = 12;
+
+// Debugging aid: copies sizeof(simd8x64<uint8_t>) bytes of text into a NUL-terminated, printable buffer
+UNUSED static char * format_input_text_64(const uint8_t *text) {
+  static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1); // allocated on first call, shared by all calls, never freed; malloc result is not checked
+  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
+    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); // signed compare: bytes below ' ' and bytes >= 0x80 are shown as '_'
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf; // points at the shared static buffer, so the next call overwrites it
+}
+
 // Routines to print masks and text for debugging bitmask operations
 UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
   static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
@@ -2709,6 +2852,34 @@ UNUSED static char * format_mask(uint64_t mask) {
   buf[64] = '\0';
   return buf;
 }
+
+template<size_t STEP_SIZE>
+really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} // lenminusstep is clamped to 0 when len < STEP_SIZE, so the subtraction cannot wrap
+
+template<size_t STEP_SIZE>
+really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; } // offset into buf of the current block
+
+template<size_t STEP_SIZE>
+really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
+  return idx < lenminusstep; // strict '<': true only while more than STEP_SIZE bytes remain, so the final bytes are left for get_remainder()
+}
+
+template<size_t STEP_SIZE>
+really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
+  return &buf[idx]; // points directly into the input buffer at the current offset; nothing is copied
+}
+
+template<size_t STEP_SIZE>
+really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
+  memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+  memcpy(dst, buf + idx, len - idx); // NOTE(review): dst must hold STEP_SIZE bytes; assumes len - idx <= STEP_SIZE, i.e. has_full_block() is already false
+  return len - idx; // number of real (non-padding) bytes placed in dst
+}
+
+template<size_t STEP_SIZE>
+really_inline void buf_block_reader<STEP_SIZE>::advance() {
+  idx += STEP_SIZE; // step to the next block; no bounds check is performed here
+}
 /* end file src/generic/stage1/buf_block_reader.h */
 /* begin file src/generic/stage1/json_string_scanner.h */
 namespace stage1 {
@@ -3008,13 +3179,15 @@ template<size_t STEP_SIZE>
 error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
   buf_block_reader<STEP_SIZE> reader(buf, len);
   json_minifier minifier(dst);
+
+  // Index the first n-1 blocks
   while (reader.has_full_block()) {
     minifier.step<STEP_SIZE>(reader.full_block(), reader);
   }
 
-  if (likely(reader.has_remainder())) {
-    uint8_t block[STEP_SIZE];
-    reader.get_remainder(block);
+  // Index the last (remainder) block, padded with spaces
+  uint8_t block[STEP_SIZE];
+  if (likely(reader.get_remainder(block)) > 0) {
     minifier.step<STEP_SIZE>(block, reader);
   }
 
@@ -3027,6 +3200,94 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
   return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
 }
 
+/* begin file src/generic/stage1/find_next_document_index.h */
+/**
+  * This algorithm is used to quickly identify the last structural position that
+  * makes up a complete document.
+  *
+  * It does this by going backwards and finding the last *document boundary* (a
+  * place where one value follows another without a comma between them). If the
+  * last document (the characters after the boundary) has an equal number of
+  * start and end brackets, it is considered complete.
+  *
+  * Simply put, we iterate over the structural characters, starting from
+  * the end. We consider that we found the end of a JSON document when the
+  * first element of the pair is NOT one of these characters: '{' '[' ':' ','
+  * and when the second element is NOT one of these characters: '}' ']' ':' ','.
+  *
+  * This simple comparison works most of the time, but it does not cover cases
+  * where the batch's structural indexes contain a perfect amount of documents.
+  * In such a case, we do not have access to the structural index which follows
+  * the last document, therefore, we do not have access to the second element in
+  * the pair, and that means we cannot identify the last document. To fix this
+  * issue, we keep a count of the open and closed curly/square braces we found
+  * while searching for the pair. When we find a pair AND the count of open and
+  * closed curly/square braces is the same, we know that we just passed a
+  * complete document, therefore the last json buffer location is the end of the
+  * batch.
+  */
+really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) {
+  // TODO don't count separately, just figure out depth
+  auto arr_cnt = 0;
+  auto obj_cnt = 0;
+  for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { // i > 0 keeps i - 1 valid below; NOTE(review): confirm callers guarantee n_structural_indexes >= 1
+    auto idxb = parser.structural_indexes[i]; // second element of the pair
+    switch (parser.buf[idxb]) {
+    case ':':
+    case ',':
+      continue; // a separator cannot start a document
+    case '}':
+      obj_cnt--;
+      continue; // a closer cannot start a document
+    case ']':
+      arr_cnt--;
+      continue;
+    case '{':
+      obj_cnt++;
+      break; // an opener may start a document: go on to examine the preceding structural
+    case '[':
+      arr_cnt++;
+      break;
+    }
+    auto idxa = parser.structural_indexes[i - 1]; // first element of the pair
+    switch (parser.buf[idxa]) {
+    case '{':
+    case '[':
+    case ':':
+    case ',':
+      continue; // the value at i belongs to the same document as what precedes it
+    }
+    // Last document is complete, so the next document will appear after!
+    if (!arr_cnt && !obj_cnt) {
+      return parser.n_structural_indexes;
+    }
+    // Last document is incomplete; return i, the structural index at which it starts
+    return i;
+  }
+  return 0; // no document boundary was found among the structurals
+}
+
+// Skip the last character if it is partial: returns len reduced by 1, 2 or 3 when buf ends in an incomplete multi-byte UTF-8 character
+really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
+  if (unlikely(len < 3)) { // short inputs: only look at the bytes that exist (every case below returns)
+    switch (len) {
+      case 2:
+        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
+        if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
+        return len;
+      case 1:
+        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
+        return len;
+      case 0:
+        return len;
+    }
+  }
+  if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
+  if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
+  if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
+  return len;
+}
+/* end file src/generic/stage1/find_next_document_index.h */
 /* begin file src/generic/stage1/utf8_lookup2_algorithm.h */
 //
 // Detect Unicode errors.
@@ -3077,9 +3338,9 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
 //   support values with more than 23 bits (which a 4-byte character supports).
 //
 //   e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
-//   
+//
 // Legal utf-8 byte sequences per  http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
-// 
+//
 //   Code Points        1st       2s       3s       4s
 //  U+0000..U+007F     00..7F
 //  U+0080..U+07FF     C2..DF   80..BF
@@ -3094,6 +3355,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
 using namespace simd;
 
 namespace utf8_validation {
+  // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)".
 
   //
   // Find special case UTF-8 errors where the character is technically readable (has the right length)
@@ -3138,7 +3400,7 @@ namespace utf8_validation {
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
       // [0___]____ (ASCII)
-      0, 0, 0, 0,                          
+      0, 0, 0, 0,
       0, 0, 0, 0,
       // [10__]____ (continuation)
       0, 0, 0, 0,
@@ -3169,214 +3431,6 @@ namespace utf8_validation {
     return byte_1_high & byte_1_low & byte_2_high;
   }
 
-  //
-  // Validate the length of multibyte characters (that each multibyte character has the right number
-  // of continuation characters, and that all continuation characters are part of a multibyte
-  // character).
-  //
-  // Algorithm
-  // =========
-  //
-  // This algorithm compares *expected* continuation characters with *actual* continuation bytes,
-  // and emits an error anytime there is a mismatch.
-  //
-  // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte
-  // characters, the file will look like this:
-  //
-  // | Character             | 𝄞  |    |    |    | ₿  |    |    | ֏  |    | a  | b  |
-  // |-----------------------|----|----|----|----|----|----|----|----|----|----|----|
-  // | Character Length      |  4 |    |    |    |  3 |    |    |  2 |    |  1 |  1 |
-  // | Byte                  | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 |
-  // | is_second_byte        |    |  X |    |    |    |  X |    |    |  X |    |    |
-  // | is_third_byte         |    |    |  X |    |    |    |  X |    |    |    |    |
-  // | is_fourth_byte        |    |    |    |  X |    |    |    |    |    |    |    |
-  // | expected_continuation |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
-  // | is_continuation       |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
-  //
-  // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation):
-  //
-  // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not
-  //   part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just
-  //   floating around extra outside of any character, or that there is an illegal 5-byte character,
-  //   or maybe it's at the beginning of the file before any characters have started; but it's an
-  //   error in all these cases.
-  // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means
-  //   we started a new character before we were finished with the current one.
-  //
-  // Getting the Previous Bytes
-  // --------------------------
-  //
-  // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte
-  // character, we need to "shift the bytes" to find that out. This is what they mean:
-  //
-  // - `is_continuation`: if the current byte is a continuation.
-  // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character.
-  // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character.
-  // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character.
-  //
-  // We use shuffles to go n bytes back, selecting part of the current `input` and part of the
-  // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller
-  // function, because the 1-byte-back data is used by other checks as well.
-  //
-  // Getting the Continuation Mask
-  // -----------------------------
-  //
-  // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as
-  // numbers, using signed `<` and `>` operations to check if they are continuations or leads.
-  // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because
-  // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones).
-  //
-  // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads,"
-  // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them.
-  // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0.
-  //
-  // When treated as signed numbers, they look like this:
-  //
-  // | Type         | High Bits  | Binary Range | Signed |
-  // |--------------|------------|--------------|--------|
-  // | ASCII        | `0`        | `01111111`   |   127  |
-  // |              |            | `00000000`   |     0  |
-  // | 4+-Byte Lead | `1111`     | `11111111`   |    -1  |
-  // |              |            | `11110000    |   -16  |
-  // | 3-Byte Lead  | `1110`     | `11101111`   |   -17  |
-  // |              |            | `11100000    |   -32  |
-  // | 2-Byte Lead  | `110`      | `11011111`   |   -33  |
-  // |              |            | `11000000    |   -64  |
-  // | Continuation | `10`       | `10111111`   |   -65  |
-  // |              |            | `10000000    |  -128  |
-  //
-  // This makes it pretty easy to get the continuation mask! It's just a single comparison:
-  //
-  // ```
-  // is_continuation = input < -64`
-  // ```
-  //
-  // We can do something similar for the others, but it takes two comparisons instead of one: "is
-  // the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and
-  // `> -64`. Surely we can do better, they're right next to each other!
-  //
-  // Getting the is_xxx Masks: Shifting the Range
-  // --------------------------------------------
-  //
-  // Notice *why* continuations were a single comparison. The actual *range* would require two
-  // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get
-  // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be
-  // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`.
-  //
-  // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps
-  // ASCII down into the negative, and puts 4+-Byte Lead at the top:
-  //
-  // | Type                 | High Bits  | Binary Range | Signed |
-  // |----------------------|------------|--------------|-------|
-  // | 4+-Byte Lead (+ 127) | `0111`     | `01111111`   |   127 |
-  // |                      |            | `01110000    |   112 |
-  // |----------------------|------------|--------------|-------|
-  // | 3-Byte Lead (+ 127)  | `0110`     | `01101111`   |   111 |
-  // |                      |            | `01100000    |    96 |
-  // |----------------------|------------|--------------|-------|
-  // | 2-Byte Lead (+ 127)  | `010`      | `01011111`   |    95 |
-  // |                      |            | `01000000    |    64 |
-  // |----------------------|------------|--------------|-------|
-  // | Continuation (+ 127) | `00`       | `00111111`   |    63 |
-  // |                      |            | `00000000    |     0 |
-  // |----------------------|------------|--------------|-------|
-  // | ASCII (+ 127)        | `1`        | `11111111`   |    -1 |
-  // |                      |            | `10000000`   |  -128 |
-  // |----------------------|------------|--------------|-------|
-  // 
-  // *Now* we can use signed `>` on all of them:
-  //
-  // ```
-  // prev1 = input.prev<1>
-  // prev2 = input.prev<2>
-  // prev3 = input.prev<3>
-  // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128`
-  // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128`
-  // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128`
-  // is_second_byte = prev1_flipped > 63;  // 2+-byte lead
-  // is_third_byte  = prev2_flipped > 95;  // 3+-byte lead
-  // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead
-  // ```
-  //
-  // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number
-  // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3
-  // `^`'s at a time on Haswell, but only 2 `+`'s).
-  //
-  // That doesn't look like it saved us any instructions, did it? Well, because we're adding the
-  // same number to all of them, we can save one of those `+ 128` operations by assembling
-  // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128
-  // to it. One more instruction saved!
-  //
-  // ```
-  // prev1 = input.prev<1>
-  // prev3 = input.prev<3>
-  // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128`
-  // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128`
-  // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped): // <shuffle: take the first 2 bytes from prev1 and the rest from prev3  
-  // ```
-  //
-  // ### Bringing It All Together: Detecting the Errors
-  //
-  // At this point, we have `is_continuation`, `is_first_byte`, `is_second_byte` and `is_third_byte`.
-  // All we have left to do is check if they match!
-  //
-  // ```
-  // return (is_second_byte | is_third_byte | is_fourth_byte) ^ is_continuation;
-  // ```
-  //
-  // But wait--there's more. The above statement is only 3 operations, but they *cannot be done in
-  // parallel*. You have to do 2 `|`'s and then 1 `&`. Haswell, at least, has 3 ports that can do
-  // bitwise operations, and we're only using 1!
-  //
-  // Epilogue: Addition For Booleans
-  // -------------------------------
-  //
-  // There is one big case the above code doesn't explicitly talk about--what if is_second_byte
-  // and is_third_byte are BOTH true? That means there is a 3-byte and 2-byte character right next
-  // to each other (or any combination), and the continuation could be part of either of them!
-  // Our algorithm using `&` and `|` won't detect that the continuation byte is problematic.
-  //
-  // Never fear, though. If that situation occurs, we'll already have detected that the second
-  // leading byte was an error, because it was supposed to be a part of the preceding multibyte
-  // character, but it *wasn't a continuation*.
-  //
-  // We could stop here, but it turns out that we can fix it using `+` and `-` instead of `|` and
-  // `&`, which is both interesting and possibly useful (even though we're not using it here). It
-  // exploits the fact that in SIMD, a *true* value is -1, and a *false* value is 0. So those
-  // comparisons were giving us numbers!
-  //
-  // Given that, if you do `is_second_byte + is_third_byte + is_fourth_byte`, under normal
-  // circumstances you will either get 0 (0 + 0 + 0) or -1 (-1 + 0 + 0, etc.). Thus,
-  // `(is_second_byte + is_third_byte + is_fourth_byte) - is_continuation` will yield 0 only if
-  // *both* or *neither* are 0 (0-0 or -1 - -1). You'll get 1 or -1 if they are different. Because
-  // *any* nonzero value is treated as an error (not just -1), we're just fine here :)
-  //
-  // Further, if *more than one* multibyte character overlaps,
-  // `is_second_byte + is_third_byte + is_fourth_byte` will be -2 or -3! Subtracting `is_continuation`
-  // from *that* is guaranteed to give you a nonzero value (-1, -2 or -3). So it'll always be
-  // considered an error.
-  //
-  // One reason you might want to do this is parallelism. ^ and | are not associative, so
-  // (A | B | C) ^ D will always be three operations in a row: either you do A | B -> | C -> ^ D, or
-  // you do B | C -> | A -> ^ D. But addition and subtraction *are* associative: (A + B + C) - D can
-  // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and
-  // then adds the result together. Same number of operations, but if the processor can run
-  // independent things in parallel (which most can), it runs faster.
-  //
-  // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have
-  // a super nice advantage in that more of them can be run at the same time (they can run on 3
-  // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C,
-  // saving us the cycle we would have earned by using +. Even more, using an instruction with a
-  // wider array of ports can help *other* code run ahead, too, since these instructions can "get
-  // out of the way," running on a port other instructions can't.
-  // 
-  // Epilogue II: One More Trick
-  // ---------------------------
-  //
-  // There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay
-  // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in
-  // check_special_cases()--but we'll talk about that there :)
-  //
   really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, simd8<uint8_t> prev1) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
@@ -3514,16 +3568,22 @@ class bit_indexer {
 
 class json_structural_indexer {
 public:
+  /**
+   * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
+   *
+   * @param partial Setting the partial parameter to true allows the find_structural_bits to
+   *   tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
+   *   you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
+   */
   template<size_t STEP_SIZE>
-  static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept;
+  static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept;
 
 private:
-  really_inline json_structural_indexer(uint32_t *structural_indexes)
-  : indexer{structural_indexes} {}
+  really_inline json_structural_indexer(uint32_t *structural_indexes);
   template<size_t STEP_SIZE>
   really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
   really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
-  really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming);
+  really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
 
   json_scanner scanner{};
   utf8_checker checker{};
@@ -3532,42 +3592,44 @@ class json_structural_indexer {
   uint64_t unescaped_chars_error = 0;
 };
 
-really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
-  uint64_t unescaped = in.lteq(0x1F);
-  checker.check_next_input(in);
-  indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
-  prev_structurals = block.structural_start();
-  unescaped_chars_error |= block.non_quote_inside_string(unescaped);
-}
+really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
 
-really_inline error_code json_structural_indexer::finish(parser &parser, size_t idx, size_t len, bool streaming) {
-  // Write out the final iteration's structurals
-  indexer.write(uint32_t(idx-64), prev_structurals);
+//
+// PERF NOTES:
+// We pipe 2 inputs through these stages:
+// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
+//    2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
+// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
+//    The output of step 1 depends entirely on this information. These functions don't quite use
+//    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
+//    at a time. The second input's scans have some dependency on the first one's finishing, but
+//    they can make a lot of progress before they need that information.
+// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
+//    to finish: utf-8 checks and generating the output from the last iteration.
+//
+// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
+// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
+// workout.
+//
+template<size_t STEP_SIZE>
+error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept {
+  if (unlikely(len > parser.capacity())) { return CAPACITY; }
+  if (partial) { len = trim_partial_utf8(buf, len); }
 
-  error_code error = scanner.finish(streaming);
-  if (unlikely(error != SUCCESS)) { return error; }
+  buf_block_reader<STEP_SIZE> reader(buf, len);
+  json_structural_indexer indexer(parser.structural_indexes.get());
 
-  if (unescaped_chars_error) {
-    return UNESCAPED_CHARS;
+  // Read all but the last block
+  while (reader.has_full_block()) {
+    indexer.step<STEP_SIZE>(reader.full_block(), reader);
   }
 
-  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
-  /* a valid JSON file cannot have zero structural indexes - we should have
-   * found something */
-  if (unlikely(parser.n_structural_indexes == 0u)) {
-    return EMPTY;
-  }
-  if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
-    return UNEXPECTED_ERROR;
-  }
-  if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) {
-    /* the string might not be NULL terminated, but we add a virtual NULL
-     * ending character. */
-    parser.structural_indexes[parser.n_structural_indexes++] = uint32_t(len);
-  }
-  /* make it safe to dereference one beyond this array */
-  parser.structural_indexes[parser.n_structural_indexes] = 0;
-  return checker.errors();
+  // Take care of the last block (will always be there unless file is empty)
+  uint8_t block[STEP_SIZE];
+  if (unlikely(reader.get_remainder(block) == 0)) { return EMPTY; }
+  indexer.step<STEP_SIZE>(block, reader);
+
+  return indexer.finish(parser, reader.block_index(), len, partial);
 }
 
 template<>
@@ -3589,61 +3651,76 @@ really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_b
   reader.advance();
 }
 
-//
-// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
-//
-// PERF NOTES:
-// We pipe 2 inputs through these stages:
-// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
-//    2 inputs' worth at once so that by the time step 2 is looking for them input, it's available.
-// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
-//    The output of step 1 depends entirely on this information. These functions don't quite use
-//    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
-//    at a time. The second input's scans has some dependency on the first ones finishing it, but
-//    they can make a lot of progress before they need that information.
-// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
-//    to finish: utf-8 checks and generating the output from the last iteration.
-//
-// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
-// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
-// workout.
-//
-// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings.
-// The caller should still ensure that the input is valid UTF-8. If you are processing substrings,
-// you may want to call on a function like trimmed_length_safe_utf8.
-template<size_t STEP_SIZE>
-error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept {
-  if (unlikely(len > parser.capacity())) { return CAPACITY; }
+really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
+  uint64_t unescaped = in.lteq(0x1F);
+  checker.check_next_input(in);
+  indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
+  prev_structurals = block.structural_start();
+  unescaped_chars_error |= block.non_quote_inside_string(unescaped);
+}
 
-  buf_block_reader<STEP_SIZE> reader(buf, len);
-  json_structural_indexer indexer(parser.structural_indexes.get());
-  while (reader.has_full_block()) {
-    indexer.step<STEP_SIZE>(reader.full_block(), reader);
-  }
+really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) {
+  // Write out the final iteration's structurals
+  indexer.write(uint32_t(idx-64), prev_structurals);
+
+  error_code error = scanner.finish(partial);
+  if (unlikely(error != SUCCESS)) { return error; }
 
-  if (likely(reader.has_remainder())) {
-    uint8_t block[STEP_SIZE];
-    reader.get_remainder(block);
-    indexer.step<STEP_SIZE>(block, reader);
+  if (unescaped_chars_error) {
+    return UNESCAPED_CHARS;
   }
 
-  return indexer.finish(parser, reader.block_index(), len, streaming);
+  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
+  /***
+   * This is related to https://github.com/simdjson/simdjson/issues/906
+   * Basically, we want to make sure that if the parsing continues beyond the last (valid)
+   * structural character, it quickly stops.
+   * Only three structural characters can be repeated without triggering an error in JSON:  [,] and }.
+   * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
+   * continues, then it must be [,] or }.
+   * Suppose it is ] or }. We backtrack to the first character, what could it be that would
+   * not trigger an error? It could be ] or } but no, because you can't start a document that way.
+   * It can't be a comma, a colon or any simple value. So the only way we could continue is
+   * if the repeated character is [. But if so, the document must start with [. But if the document
+   * starts with [, it should end with ]. If we enforce that rule, then we would get
+   * ][[ which is invalid.
+   **/
+  parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
+  parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
+  parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
+  parser.next_structural_index = 0;
+  // a valid JSON file cannot have zero structural indexes - we should have found something
+  if (unlikely(parser.n_structural_indexes == 0u)) {
+    return EMPTY;
+  }
+  if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
+    return UNEXPECTED_ERROR;
+  }
+  if (partial) {
+    auto new_structural_indexes = find_next_document_index(parser);
+    if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
+      return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
+    }
+    parser.n_structural_indexes = new_structural_indexes;
+  }
+  return checker.errors();
 }
 
 } // namespace stage1
 /* end file src/generic/stage1/json_structural_indexer.h */
-WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept {
-  return arm64::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming);
+WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
+  this->buf = _buf;
+  this->len = _len;
+  return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
 }
 
 } // namespace arm64
 } // namespace simdjson
-/* end file src/generic/stage1/json_structural_indexer.h */
-/* begin file src/arm64/stage2.cpp */
-#ifndef SIMDJSON_ARM64_STAGE2_H
-#define SIMDJSON_ARM64_STAGE2_H
 
-/* arm64/implementation.h already included: #include "arm64/implementation.h" */
+//
+// Stage 2
+//
+
 /* begin file src/arm64/stringparsing.h */
 #ifndef SIMDJSON_ARM64_STRINGPARSING_H
 #define SIMDJSON_ARM64_STRINGPARSING_H
@@ -4049,10 +4126,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
   // If you consume a large value and you map it to "infinity", you will no
   // longer be able to serialize back a standard-compliant JSON. And there is
   // no realistic application where you might need values so large than they
-  // can't fit in binary64. The maximal value is about  1.7976931348623157 ×
+  // can't fit in binary64. The maximal value is about  1.7976931348623157 x
   // 10^308 It is an unimaginable large number. There will never be any piece of
   // engineering involving as many as 10^308 parts. It is estimated that there
-  // are about 10^80 atoms in the universe.  The estimate for the total number
+  // are about 10^80 atoms in the universe.  The estimate for the total number
   // of electrons is similar. Using a double-precision floating-point value, we
   // can represent easily the number of atoms in the universe. We could  also
   // represent the number of ways you can pick any three individual atoms at
@@ -4072,26 +4149,6 @@ really_inline bool is_integer(char c) {
   // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers
 }
 
-// We need to check that the character following a zero is valid. This is
-// probably frequent and it is harder than it looks. We are building all of this
-// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)...
-const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
-    1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-
-really_inline bool
-is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
-  return structural_or_whitespace_or_exponent_or_decimal_negated[c];
-}
 
 // check quickly whether the next 8 chars are made of digits
 // at a glance, it looks better than Mula's
@@ -4169,14 +4226,14 @@ never_inline bool parse_large_integer(const uint8_t *const src,
       // as a positive signed integer, but the negative version is
       // possible.
       constexpr int64_t signed_answer = INT64_MIN;
-      writer.write_s64(signed_answer);
+      writer.append_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
     } else {
       // we can negate safely
       int64_t signed_answer = -static_cast<int64_t>(i);
-      writer.write_s64(signed_answer);
+      writer.append_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
@@ -4189,12 +4246,12 @@ never_inline bool parse_large_integer(const uint8_t *const src,
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(i, src);
 #endif
-      writer.write_s64(i);
+      writer.append_s64(i);
     } else {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_unsigned_integer(i, src);
 #endif
-      writer.write_u64(i);
+      writer.append_u64(i);
     }
   }
   return is_structural_or_whitespace(*p);
@@ -4204,7 +4261,7 @@ template<typename W>
 bool slow_float_parsing(UNUSED const char * src, W writer) {
   double d;
   if (parse_float_strtod(src, &d)) {
-    writer.write_double(d);
+    writer.append_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_float(d, (const uint8_t *)src);
 #endif
@@ -4228,10 +4285,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) {
 template<typename W>
 really_inline bool parse_number(UNUSED const uint8_t *const src,
                                 UNUSED bool found_minus,
-                                W writer) {
+                                W &writer) {
 #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
                                   // useful to skip parsing
-  writer.write_s64(0);        // always write zero
+  writer.append_s64(0);        // always write zero
   return true;                    // always succeeds
 #else
   const char *p = reinterpret_cast<const char *>(src);
@@ -4251,7 +4308,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
   uint64_t i;      // an unsigned int avoids signed overflows (which are bad)
   if (*p == '0') { // 0 cannot be followed by an integer
     ++p;
-    if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
+    if (is_integer(*p)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_invalid_number(src);
 #endif
@@ -4375,7 +4432,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       }
       // we over-decrement by one when there is a '.'
       digit_count -= int(start - start_digits);
-      if (digit_count >= 19) {
+      if (unlikely(digit_count >= 19)) {
         // Ok, chances are good that we had an overflow!
         // this is almost never going to get called!!!
         // we start anew, going slowly!!!
@@ -4383,14 +4440,22 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
         // 10000000000000000000000000000000000000000000e+308
         // 3.1415926535897932384626433832795028841971693993751
         //
-        return slow_float_parsing((const char *) src, writer);
+        bool success = slow_float_parsing((const char *) src, writer);
+        // The number was already written, but we made a copy of the writer when we passed it to
+        // slow_float_parsing(), so we have to skip those tape spots now that we've returned
+        writer.skip_double();
+        return success;
       }
     }
     if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) ||
         (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!!
       // this is almost never going to get called!!!
       // we start anew, going slowly!!!
-      return slow_float_parsing((const char *) src, writer);
+      bool success = slow_float_parsing((const char *) src, writer);
+      // The number was already written, but we made a copy of the writer when we passed it to the
+      // slow_float_parsing() function, so we have to skip those tape spots now that we've returned
+      writer.skip_double();
+      return success;
     }
     bool success = true;
     double d = compute_float_64(exponent, i, negative, &success);
@@ -4399,7 +4464,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       success = parse_float_strtod((const char *)src, &d);
     }
     if (success) {
-      writer.write_double(d);
+      writer.append_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_float(d, src);
 #endif
@@ -4414,10 +4479,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
     if (unlikely(digit_count >= 18)) { // this is uncommon!!!
       // there is a good chance that we had an overflow, so we need
       // need to recover: we parse the whole thing again.
-      return parse_large_integer(src, writer, found_minus);
+      bool success = parse_large_integer(src, writer, found_minus);
+      // The number was already written, but we made a copy of the writer when we passed it to
+      // parse_large_integer(), so we have to skip those tape spots now that we've returned
+      writer.skip_large_integer();
+      return success;
     }
     i = negative ? 0 - i : i;
-    writer.write_s64(i);
+    writer.append_s64(i);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_integer(i, src);
 #endif
@@ -4439,6 +4508,72 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
 namespace simdjson {
 namespace arm64 {
 
+/* begin file src/generic/stage2/logger.h */
+// This is for an internal-only stage 2 specific logger.
+// Set LOG_ENABLED = true to log what stage 2 is doing!
+namespace logger {
+  static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
+
+  static constexpr const bool LOG_ENABLED = false;
+  static constexpr const int LOG_EVENT_LEN = 30;
+  static constexpr const int LOG_BUFFER_LEN = 20;
+  static constexpr const int LOG_DETAIL_LEN = 50;
+  static constexpr const int LOG_INDEX_LEN = 10;
+
+  static int log_depth; // Not threadsafe. Log only.
+
+  // Helper to turn unprintable or newline characters into spaces
+  static really_inline char printable_char(char c) {
+    if (c >= 0x20) {
+      return c;
+    } else {
+      return ' ';
+    }
+  }
+
+  // Print the table header and reset log_depth
+  static really_inline void log_start() {
+    if (LOG_ENABLED) {
+      log_depth = 0;
+      printf("\n");
+      printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index");
+      printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES);
+    }
+  }
+
+  static really_inline void log_string(const char *message) {
+    if (LOG_ENABLED) {
+      printf("%s\n", message);
+    }
+  }
+
+  // Logs a single line of the log table (one row per event)
+  template<typename S>
+  static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
+    if (LOG_ENABLED) {
+      printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
+      {
+        // Print the next N characters in the buffer.
+        printf("| ");
+        // Print the characters starting from the current buffer position.
+        // Print spaces for unprintable or newline characters.
+        for (int i=0;i<LOG_BUFFER_LEN;i++) {
+          printf("%c", printable_char(structurals.current()[i]));
+        }
+        printf(" ");
+      }
+      printf("|    %c ", printable_char(structurals.current_char()));
+      printf("|    %c ", printable_char(structurals.peek_next_char()));
+      printf("| %5u ", structurals.parser.structural_indexes[*(structurals.current_structural+1)]);
+      printf("| %5u ", structurals.next_tape_index());
+      printf("| %-*s ", LOG_DETAIL_LEN, detail);
+      printf("| %*u ", LOG_INDEX_LEN, *structurals.current_structural);
+      printf("|\n");
+    }
+  }
+} // namespace logger
+
+/* end file src/generic/stage2/logger.h */
 /* begin file src/generic/stage2/atomparsing.h */
 namespace stage2 {
 namespace atomparsing {
@@ -4497,26 +4632,34 @@ namespace stage2 {
 
 class structural_iterator {
 public:
-  really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index)
-    : buf{_buf},
-     len{_len},
-     structural_indexes{_structural_indexes},
-     next_structural{next_structural_index}
-    {}
-  really_inline char advance_char() {
-    idx = structural_indexes[next_structural];
-    next_structural++;
-    c = *current();
-    return c;
+  const uint8_t* const buf;
+  uint32_t *current_structural;
+  dom_parser_implementation &parser;
+
+  // Start a structural iterator at the given index into the parser's structural_indexes
+  really_inline structural_iterator(dom_parser_implementation &_parser, size_t start_structural_index)
+    : buf{_parser.buf},
+      current_structural{&_parser.structural_indexes[start_structural_index]},
+      parser{_parser} {
   }
+  // Get the buffer position of the current structural character
+  really_inline const uint8_t* current() {
+    return &buf[*current_structural];
+  }
+  // Get the current structural character
   really_inline char current_char() {
-    return c;
+    return buf[*current_structural];
   }
-  really_inline const uint8_t* current() {
-    return &buf[idx];
+  // Get the next structural character without advancing
+  really_inline char peek_next_char() {
+    return buf[*(current_structural+1)];
+  }
+  really_inline char advance_char() {
+    current_structural++;
+    return buf[*current_structural];
   }
   really_inline size_t remaining_len() {
-    return len - idx;
+    return parser.len - *current_structural;
   }
   template<typename F>
   really_inline bool with_space_terminated_copy(const F& f) {
@@ -4533,32 +4676,25 @@ class structural_iterator {
     * practice unless you are in the strange scenario where you have many JSON
     * documents made of single atoms.
     */
-    char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
+    char *copy = static_cast<char *>(malloc(parser.len + SIMDJSON_PADDING));
     if (copy == nullptr) {
       return true;
     }
-    memcpy(copy, buf, len);
-    memset(copy + len, ' ', SIMDJSON_PADDING);
-    bool result = f(reinterpret_cast<const uint8_t*>(copy), idx);
+    memcpy(copy, buf, parser.len);
+    memset(copy + parser.len, ' ', SIMDJSON_PADDING);
+    bool result = f(reinterpret_cast<const uint8_t*>(copy), *current_structural);
     free(copy);
     return result;
   }
   really_inline bool past_end(uint32_t n_structural_indexes) {
-    return next_structural+1 > n_structural_indexes;
+    return current_structural >= &parser.structural_indexes[n_structural_indexes];
   }
   really_inline bool at_end(uint32_t n_structural_indexes) {
-    return next_structural+1 == n_structural_indexes;
+    return current_structural == &parser.structural_indexes[n_structural_indexes];
   }
-  really_inline size_t next_structural_index() {
-    return next_structural;
+  really_inline bool at_beginning() {
+    return current_structural == parser.structural_indexes.get();
   }
-
-  const uint8_t* const buf;
-  const size_t len;
-  const uint32_t* const structural_indexes;
-  size_t next_structural; // next structural index
-  size_t idx{0}; // location of the structural character in the input (buf)
-  uint8_t c{0};  // used to track the (structural) character we are looking at
 };
 
 } // namespace stage2
@@ -4570,8 +4706,105 @@ class structural_iterator {
 // "simdjson/stage2.h" (this simplifies amalgation)
 
 namespace stage2 {
+namespace { // Make everything here private
+
+/* begin file src/generic/stage2/tape_writer.h */
+struct tape_writer {
+  /** The next place to write to tape */
+  uint64_t *next_tape_loc;
+  
+  /** Write a signed 64-bit value to tape. */
+  really_inline void append_s64(int64_t value) noexcept;
+
+  /** Write an unsigned 64-bit value to tape. */
+  really_inline void append_u64(uint64_t value) noexcept;
+
+  /** Write a double value to tape. */
+  really_inline void append_double(double value) noexcept;
+
+  /**
+   * Append a tape entry (an 8-bit type, and 56 bits worth of value).
+   */
+  really_inline void append(uint64_t val, internal::tape_type t) noexcept;
+
+  /**
+   * Skip the current tape entry without writing.
+   *
+   * Used to skip the start of the container, since we'll come back later to fill it in when the
+   * container ends.
+   */
+  really_inline void skip() noexcept;
+
+  /**
+   * Skip the number of tape entries necessary to write a large u64 or i64.
+   */
+  really_inline void skip_large_integer() noexcept;
+
+  /**
+   * Skip the number of tape entries necessary to write a double.
+   */
+  really_inline void skip_double() noexcept;
+
+  /**
+   * Write a value to a known location on tape.
+   *
+   * Used to go back and write out the start of a container after the container ends.
+   */
+  really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
+
+private:
+  /**
+   * Append both the tape entry, and a supplementary value following it. Used for types that need
+   * all 64 bits, such as double and uint64_t.
+   */
+  template<typename T>
+  really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
+}; // struct tape_writer
+
+really_inline void tape_writer::append_s64(int64_t value) noexcept {
+  append2(0, value, internal::tape_type::INT64);
+}
+
+really_inline void tape_writer::append_u64(uint64_t value) noexcept {
+  append(0, internal::tape_type::UINT64);
+  *next_tape_loc = value;
+  next_tape_loc++;
+}
+
+/** Write a double value to tape. */
+really_inline void tape_writer::append_double(double value) noexcept {
+  append2(0, value, internal::tape_type::DOUBLE);
+}
 
-using internal::ret_address;
+really_inline void tape_writer::skip() noexcept {
+  next_tape_loc++;
+}
+
+really_inline void tape_writer::skip_large_integer() noexcept {
+  next_tape_loc += 2;
+}
+
+really_inline void tape_writer::skip_double() noexcept {
+  next_tape_loc += 2;
+}
+
+really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
+  *next_tape_loc = val | ((uint64_t(char(t))) << 56);
+  next_tape_loc++;
+}
+
+template<typename T>
+really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
+  append(val, t);
+  static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
+  memcpy(next_tape_loc, &val2, sizeof(val2));
+  next_tape_loc++;
+}
+
+really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
+  tape_loc = val | ((uint64_t(char(t))) << 56);
+}
+/* end file src/generic/stage2/tape_writer.h */
 
 #ifdef SIMDJSON_USE_COMPUTED_GOTO
 #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue }
@@ -4602,102 +4835,88 @@ using internal::ret_address;
 #endif // SIMDJSON_USE_COMPUTED_GOTO
 
 struct unified_machine_addresses {
-  ret_address array_begin;
-  ret_address array_continue;
-  ret_address error;
-  ret_address finish;
-  ret_address object_begin;
-  ret_address object_continue;
+  ret_address_t array_begin;
+  ret_address_t array_continue;
+  ret_address_t error;
+  ret_address_t finish;
+  ret_address_t object_begin;
+  ret_address_t object_continue;
 };
 
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } }
 
-struct number_writer {
-  parser &doc_parser;
-  
-  really_inline void write_s64(int64_t value) noexcept {
-    write_tape(0, internal::tape_type::INT64);
-    std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value));
-    ++doc_parser.current_loc;
-  }
-  really_inline void write_u64(uint64_t value) noexcept {
-    write_tape(0, internal::tape_type::UINT64);
-    doc_parser.doc.tape[doc_parser.current_loc++] = value;
-  }
-  really_inline void write_double(double value) noexcept {
-    write_tape(0, internal::tape_type::DOUBLE);
-    static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size");
-    memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double));
-    // doc.tape[doc.current_loc++] = *((uint64_t *)&d);
-  }
-  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
-    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
-  }
-}; // struct number_writer
-
-struct structural_parser {
-  structural_iterator structurals;
-  parser &doc_parser;
+struct structural_parser : structural_iterator {
+  /** Lets you append to the tape */
+  tape_writer tape;
   /** Next write location in the string buf for stage 2 parsing */
-  uint8_t *current_string_buf_loc{};
-  uint32_t depth;
-
-  really_inline structural_parser(
-    const uint8_t *buf,
-    size_t len,
-    parser &_doc_parser,
-    uint32_t next_structural = 0
-  ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {}
-
-  WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) {
-    doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc;
-    doc_parser.containing_scope[depth].count = 0;
-    write_tape(0, type); // if the document is correct, this gets rewritten later
-    doc_parser.ret_address[depth] = continue_state;
+  uint8_t *current_string_buf_loc;
+  /** Current depth (nested objects and arrays) */
+  uint32_t depth{0};
+
+  // For non-streaming, pass an explicit 0 as start_structural_index, which enables optimizations
+  really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index)
+    : structural_iterator(_parser, start_structural_index),
+      tape{parser.doc->tape.get()},
+      current_string_buf_loc{parser.doc->string_buf.get()} {
+  }
+
+  WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) {
+    parser.containing_scope[depth].tape_index = next_tape_index();
+    parser.containing_scope[depth].count = 0;
+    tape.skip(); // We don't actually *write* the start element until the end.
+    parser.ret_address[depth] = continue_state;
     depth++;
-    return depth >= doc_parser.max_depth();
+    bool exceeded_max_depth = depth >= parser.max_depth();
+    if (exceeded_max_depth) { log_error("Exceeded max depth!"); }
+    return exceeded_max_depth;
   }
 
-  WARN_UNUSED really_inline bool start_document(ret_address continue_state) {
-    return start_scope(internal::tape_type::ROOT, continue_state);
+  WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) {
+    log_start_value("document");
+    return start_scope(continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_object(ret_address continue_state) {
-    return start_scope(internal::tape_type::START_OBJECT, continue_state);
+  WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) {
+    log_start_value("object");
+    return start_scope(continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_array(ret_address continue_state) {
-    return start_scope(internal::tape_type::START_ARRAY, continue_state);
+  WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) {
+    log_start_value("array");
+    return start_scope(continue_state);
   }
 
   // this function is responsible for annotating the start of the scope
-  really_inline void end_scope(internal::tape_type type) noexcept {
+  really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept {
     depth--;
-    // write our doc.tape location to the header scope
+    // write our doc->tape location to the header scope
     // The root scope gets written *at* the previous location.
-    write_tape(doc_parser.containing_scope[depth].tape_index, type);
+    tape.append(parser.containing_scope[depth].tape_index, end);
     // count can overflow if it exceeds 24 bits... so we saturate
     // the convention being that a cnt of 0xffffff or more is undetermined in value (>=  0xffffff).
-    const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index;
-    const uint32_t count = doc_parser.containing_scope[depth].count;
+    const uint32_t start_tape_index = parser.containing_scope[depth].tape_index;
+    const uint32_t count = parser.containing_scope[depth].count;
     const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
-    // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index]
-    doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32);
+    // Write the start element now that the next tape index and the saturated count are known
+    tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start);
+  }
+
+  really_inline uint32_t next_tape_index() {
+    return uint32_t(tape.next_tape_loc - parser.doc->tape.get());
   }
 
   really_inline void end_object() {
-    end_scope(internal::tape_type::END_OBJECT);
+    log_end_value("object");
+    end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
   }
   really_inline void end_array() {
-    end_scope(internal::tape_type::END_ARRAY);
+    log_end_value("array");
+    end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
   }
   really_inline void end_document() {
-    end_scope(internal::tape_type::ROOT);
-  }
-
-  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
-    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
+    log_end_value("document");
+    end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT);
   }
 
   // increment_count increments the count of keys in an object or values in an array.
@@ -4705,17 +4924,16 @@ struct structural_parser {
   // must be increment in the preceding depth (depth-1) where the array or
   // the object resides.
   really_inline void increment_count() {
-    doc_parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
+    parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
   }
 
   really_inline uint8_t *on_start_string() noexcept {
-    /* we advance the point, accounting for the fact that we have a NULL
-      * termination         */
-    write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING);
+    // we advance the point, accounting for the fact that we have a NULL termination
+    tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING);
     return current_string_buf_loc + sizeof(uint32_t);
   }
 
-  really_inline bool on_end_string(uint8_t *dst) noexcept {
+  really_inline void on_end_string(uint8_t *dst) noexcept {
     uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
     // TODO check for overflow in case someone has a crazy string (>=4GB?)
     // But only add the overflow check when the document itself exceeds 4GB
@@ -4725,73 +4943,49 @@ struct structural_parser {
     // be NULL terminated? It comes at a small cost
     *dst = 0;
     current_string_buf_loc = dst + 1;
-    return true;
   }
 
-  WARN_UNUSED really_inline bool parse_string() {
+  WARN_UNUSED really_inline bool parse_string(bool key = false) {
+    log_value(key ? "key" : "string");
     uint8_t *dst = on_start_string();
-    dst = stringparsing::parse_string(structurals.current(), dst);
+    dst = stringparsing::parse_string(current(), dst);
     if (dst == nullptr) {
+      log_error("Invalid escape in string");
       return true;
     }
-    return !on_end_string(dst);
+    on_end_string(dst);
+    return false;
   }
 
   WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) {
-    number_writer writer{doc_parser};
-    return !numberparsing::parse_number(src, found_minus, writer);
+    log_value("number");
+    bool succeeded = numberparsing::parse_number(src, found_minus, tape);
+    if (!succeeded) { log_error("Invalid number"); }
+    return !succeeded;
   }
   WARN_UNUSED really_inline bool parse_number(bool found_minus) {
-    return parse_number(structurals.current(), found_minus);
-  }
-
-  WARN_UNUSED really_inline bool parse_atom() {
-    switch (structurals.current_char()) {
-      case 't':
-        if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; }
-        write_tape(0, internal::tape_type::TRUE_VALUE);
-        break;
-      case 'f':
-        if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; }
-        write_tape(0, internal::tape_type::FALSE_VALUE);
-        break;
-      case 'n':
-        if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; }
-        write_tape(0, internal::tape_type::NULL_VALUE);
-        break;
-      default:
-        return true;
-    }
-    return false;
+    return parse_number(current(), found_minus);
   }
 
-  WARN_UNUSED really_inline bool parse_single_atom() {
-    switch (structurals.current_char()) {
-      case 't':
-        if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; }
-        write_tape(0, internal::tape_type::TRUE_VALUE);
-        break;
-      case 'f':
-        if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; }
-        write_tape(0, internal::tape_type::FALSE_VALUE);
-        break;
-      case 'n':
-        if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; }
-        write_tape(0, internal::tape_type::NULL_VALUE);
-        break;
-      default:
-        return true;
-    }
-    return false;
-  }
-
-  WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) {
-    switch (structurals.current_char()) {
+  WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) {
+    switch (advance_char()) {
     case '"':
       FAIL_IF( parse_string() );
       return continue_state;
-    case 't': case 'f': case 'n':
-      FAIL_IF( parse_atom() );
+    case 't':
+      log_value("true");
+      FAIL_IF( !atomparsing::is_valid_true_atom(current()) );
+      tape.append(0, internal::tape_type::TRUE_VALUE);
+      return continue_state;
+    case 'f':
+      log_value("false");
+      FAIL_IF( !atomparsing::is_valid_false_atom(current()) );
+      tape.append(0, internal::tape_type::FALSE_VALUE);
+      return continue_state;
+    case 'n':
+      log_value("null");
+      FAIL_IF( !atomparsing::is_valid_null_atom(current()) );
+      tape.append(0, internal::tape_type::NULL_VALUE);
       return continue_state;
     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
@@ -4807,40 +5001,27 @@ struct structural_parser {
       FAIL_IF( start_array(continue_state) );
       return addresses.array_begin;
     default:
+      log_error("Non-value found when value was expected!");
       return addresses.error;
     }
   }
 
   WARN_UNUSED really_inline error_code finish() {
-    // the string might not be NULL terminated.
-    if ( !structurals.at_end(doc_parser.n_structural_indexes) ) {
-      return on_error(TAPE_ERROR);
-    }
     end_document();
+    parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]);
+
     if (depth != 0) {
-      return on_error(TAPE_ERROR);
-    }
-    if (doc_parser.containing_scope[depth].tape_index != 0) {
-      return on_error(TAPE_ERROR);
+      log_error("Unclosed objects or arrays!");
+      return parser.error = TAPE_ERROR;
     }
 
-    return on_success(SUCCESS);
-  }
-
-  really_inline error_code on_error(error_code new_error_code) noexcept {
-    doc_parser.error = new_error_code;
-    return new_error_code;
-  }
-  really_inline error_code on_success(error_code success_code) noexcept {
-    doc_parser.error = success_code;
-    doc_parser.valid = true;
-    return success_code;
+    return SUCCESS;
   }
 
   WARN_UNUSED really_inline error_code error() {
-    /* We do not need the next line because this is done by doc_parser.init_stage2(),
+    /* We do not need the next line because this is done by parser.init_stage2(),
     * pessimistically.
-    * doc_parser.is_valid  = false;
+    * parser.is_valid  = false;
     * At this point in the code, we have all the time in the world.
     * Note that we know exactly where we are in the document so we could,
     * without any overhead on the processing code, report a specific
@@ -4848,12 +5029,12 @@ struct structural_parser {
     * We could even trigger special code paths to assess what happened
     * carefully,
     * all without any added cost. */
-    if (depth >= doc_parser.max_depth()) {
-      return on_error(DEPTH_ERROR);
+    if (depth >= parser.max_depth()) {
+      return parser.error = DEPTH_ERROR;
     }
-    switch (structurals.current_char()) {
+    switch (current_char()) {
     case '"':
-      return on_error(STRING_ERROR);
+      return parser.error = STRING_ERROR;
     case '0':
     case '1':
     case '2':
@@ -4865,92 +5046,124 @@ struct structural_parser {
     case '8':
     case '9':
     case '-':
-      return on_error(NUMBER_ERROR);
+      return parser.error = NUMBER_ERROR;
     case 't':
-      return on_error(T_ATOM_ERROR);
+      return parser.error = T_ATOM_ERROR;
     case 'n':
-      return on_error(N_ATOM_ERROR);
+      return parser.error = N_ATOM_ERROR;
     case 'f':
-      return on_error(F_ATOM_ERROR);
+      return parser.error = F_ATOM_ERROR;
     default:
-      return on_error(TAPE_ERROR);
+      return parser.error = TAPE_ERROR;
     }
   }
 
   really_inline void init() {
-    current_string_buf_loc = doc_parser.doc.string_buf.get();
-    doc_parser.current_loc = 0;
-    doc_parser.valid = false;
-    doc_parser.error = UNINITIALIZED;
+    log_start();
+    parser.error = UNINITIALIZED;
   }
 
-  WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) {
-    init(); // sets is_valid to false
-    if (len > doc_parser.capacity()) {
-      return CAPACITY;
+  WARN_UNUSED really_inline error_code start(ret_address_t finish_state) {
+    // If there are no structurals left, return EMPTY
+    if (at_end(parser.n_structural_indexes)) {
+      return parser.error = EMPTY;
     }
-    // Advance to the first character as soon as possible
-    structurals.advance_char();
+
+    init();
     // Push the root scope (there is always at least one scope)
     if (start_document(finish_state)) {
-      return on_error(DEPTH_ERROR);
+      return parser.error = DEPTH_ERROR;
     }
     return SUCCESS;
   }
 
-  really_inline char advance_char() {
-    return structurals.advance_char();
+  really_inline void log_value(const char *type) {
+    logger::log_line(*this, "", type, "");
   }
-};
+
+  static really_inline void log_start() { // static: uses no parser state
+    logger::log_start(); // delegates directly to the logger
+  }
+
+  really_inline void log_start_value(const char *type) { // `type` is the label handed to the logger
+    logger::log_line(*this, "+", type, ""); // logged before the depth is bumped
+    if (logger::LOG_ENABLED) { logger::log_depth++; } // depth counter is only touched when logging is enabled
+  }
+
+  really_inline void log_end_value(const char *type) { // called with "object", "array" and "document" by the end_* methods
+    if (logger::LOG_ENABLED) { logger::log_depth--; } // drop the depth first ...
+    logger::log_line(*this, "-", type, ""); // ... so the "-" line is logged at the decremented depth
+  }
+
+  really_inline void log_error(const char *error) { // `error` is a human-readable message
+    logger::log_line(*this, "", "ERROR", error); // logged under the fixed label "ERROR"; callers set parser.error themselves
+  }
+}; // struct structural_parser
 
 // Redefine FAIL_IF to use goto since it'll be used inside the function now
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { goto error; } }
 
-} // namespace stage2
-
-/************
- * The JSON is parsed to a tape, see the accompanying tape.md file
- * for documentation.
- ***********/
-WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
+template<bool STREAMING>
+WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
+  dom_parser.doc = &doc;
   static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
-  stage2::structural_parser parser(buf, len, doc_parser);
-  error_code result = parser.start(len, addresses.finish);
+  stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
+  error_code result = parser.start(addresses.finish);
   if (result) { return result; }
 
   //
   // Read first value
   //
-  switch (parser.structurals.current_char()) {
+  switch (parser.current_char()) {
   case '{':
     FAIL_IF( parser.start_object(addresses.finish) );
     goto object_begin;
   case '[':
     FAIL_IF( parser.start_array(addresses.finish) );
+    // Make sure the outer array is closed before continuing; otherwise, there are ways we could get
+    // into memory corruption. See https://github.com/simdjson/simdjson/issues/906
+    if (!STREAMING) {
+      if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') {
+        goto error;
+      }
+    }
     goto array_begin;
   case '"':
     FAIL_IF( parser.parse_string() );
     goto finish;
-  case 't': case 'f': case 'n':
-    FAIL_IF( parser.parse_single_atom() );
+  case 't':
+    parser.log_value("true");
+    FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) );
+    parser.tape.append(0, internal::tape_type::TRUE_VALUE);
+    goto finish;
+  case 'f':
+    parser.log_value("false");
+    FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) );
+    parser.tape.append(0, internal::tape_type::FALSE_VALUE);
+    goto finish;
+  case 'n':
+    parser.log_value("null");
+    FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) );
+    parser.tape.append(0, internal::tape_type::NULL_VALUE);
     goto finish;
   case '0': case '1': case '2': case '3': case '4':
   case '5': case '6': case '7': case '8': case '9':
     FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], false);
       })
     );
     goto finish;
   case '-':
     FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], true);
       })
     );
     goto finish;
   default:
+    parser.log_error("Document starts with a non-value character");
     goto error;
   }
 
@@ -4961,43 +5174,45 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa
   switch (parser.advance_char()) {
   case '"': {
     parser.increment_count();
-    FAIL_IF( parser.parse_string() );
+    FAIL_IF( parser.parse_string(true) );
     goto object_key_state;
   }
   case '}':
     parser.end_object();
     goto scope_end;
   default:
+    parser.log_error("Object does not start with a key");
     goto error;
   }
 
 object_key_state:
-  FAIL_IF( parser.advance_char() != ':' );
-  parser.advance_char();
+  if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; }
   GOTO( parser.parse_value(addresses, addresses.object_continue) );
 
 object_continue:
   switch (parser.advance_char()) {
   case ',':
     parser.increment_count();
-    FAIL_IF( parser.advance_char() != '"' );
-    FAIL_IF( parser.parse_string() );
+    if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; }
+    FAIL_IF( parser.parse_string(true) );
     goto object_key_state;
   case '}':
     parser.end_object();
     goto scope_end;
   default:
+    parser.log_error("No comma between object fields");
     goto error;
   }
 
 scope_end:
-  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
+  CONTINUE( parser.parser.ret_address[parser.depth] );
 
 //
 // Array parser states
 //
 array_begin:
-  if (parser.advance_char() == ']') {
+  if (parser.peek_next_char() == ']') {
+    parser.advance_char();
     parser.end_array();
     goto scope_end;
   }
@@ -5012,12 +5227,12 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa
   switch (parser.advance_char()) {
   case ',':
     parser.increment_count();
-    parser.advance_char();
     goto main_array_switch;
   case ']':
     parser.end_array();
     goto scope_end;
   default:
+    parser.log_error("Missing comma between array values");
     goto error;
   }
 
@@ -5028,194 +5243,298 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa
   return parser.error();
 }
 
-WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
-  error_code code = stage1(buf, len, doc_parser, false);
-  if (!code) {
-    code = stage2(buf, len, doc_parser);
-  }
-  return code;
-}
-/* end file src/generic/stage2/structural_parser.h */
-/* begin file src/generic/stage2/streaming_structural_parser.h */
-namespace stage2 {
-
-struct streaming_structural_parser: structural_parser {
-  really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {}
-
-  // override to add streaming
-  WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) {
-    init(); // sets is_valid to false
-    // Capacity ain't no thang for streaming, so we don't check it.
-    // Advance to the first character as soon as possible
-    advance_char();
-    // Push the root scope (there is always at least one scope)
-    if (start_document(finish_parser)) {
-      return on_error(DEPTH_ERROR);
-    }
-    return SUCCESS;
-  }
-
-  // override to add streaming
-  WARN_UNUSED really_inline error_code finish() {
-    if ( structurals.past_end(doc_parser.n_structural_indexes) ) {
-      return on_error(TAPE_ERROR);
-    }
-    end_document();
-    if (depth != 0) {
-      return on_error(TAPE_ERROR);
-    }
-    if (doc_parser.containing_scope[depth].tape_index != 0) {
-      return on_error(TAPE_ERROR);
-    }
-    bool finished = structurals.at_end(doc_parser.n_structural_indexes);
-    return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
-  }
-};
-
+} // namespace {}
 } // namespace stage2
 
 /************
  * The JSON is parsed to a tape, see the accompanying tape.md file
  * for documentation.
  ***********/
-WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept {
-  static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
-  stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json));
-  error_code result = parser.start(len, addresses.finish);
+WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
+  error_code result = stage2::parse_structurals<false>(*this, _doc);
   if (result) { return result; }
-  //
-  // Read first value
-  //
-  switch (parser.structurals.current_char()) {
-  case '{':
-    FAIL_IF( parser.start_object(addresses.finish) );
-    goto object_begin;
-  case '[':
-    FAIL_IF( parser.start_array(addresses.finish) );
-    goto array_begin;
-  case '"':
-    FAIL_IF( parser.parse_string() );
-    goto finish;
-  case 't': case 'f': case 'n':
-    FAIL_IF( parser.parse_single_atom() );
-    goto finish;
-  case '0': case '1': case '2': case '3': case '4':
-  case '5': case '6': case '7': case '8': case '9':
-    FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
-        return parser.parse_number(&copy[idx], false);
-      })
-    );
-    goto finish;
-  case '-':
-    FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
-        return parser.parse_number(&copy[idx], true);
-      })
-    );
-    goto finish;
-  default:
-    goto error;
-  }
-
-//
-// Object parser parsers
-//
-object_begin:
-  switch (parser.advance_char()) {
-  case '"': {
-    FAIL_IF( parser.parse_string() );
-    goto object_key_parser;
-  }
-  case '}':
-    parser.end_object();
-    goto scope_end;
-  default:
-    goto error;
-  }
-
-object_key_parser:
-  FAIL_IF( parser.advance_char() != ':' );
-  parser.increment_count();
-  parser.advance_char();
-  GOTO( parser.parse_value(addresses, addresses.object_continue) );
-
-object_continue:
-  switch (parser.advance_char()) {
-  case ',':
-    FAIL_IF( parser.advance_char() != '"' );
-    FAIL_IF( parser.parse_string() );
-    goto object_key_parser;
-  case '}':
-    parser.end_object();
-    goto scope_end;
-  default:
-    goto error;
-  }
-
-scope_end:
-  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
 
-//
-// Array parser parsers
-//
-array_begin:
-  if (parser.advance_char() == ']') {
-    parser.end_array();
-    goto scope_end;
+  // If we didn't make it to the end, it's an error
+  if ( next_structural_index != n_structural_indexes ) {
+    logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
+    return error = TAPE_ERROR;
   }
-  parser.increment_count();
-
-main_array_switch:
-  /* we call update char on all paths in, so we can peek at parser.c on the
-   * on paths that can accept a close square brace (post-, and at start) */
-  GOTO( parser.parse_value(addresses, addresses.array_continue) );
 
-array_continue:
-  switch (parser.advance_char()) {
-  case ',':
-    parser.increment_count();
-    parser.advance_char();
-    goto main_array_switch;
-  case ']':
-    parser.end_array();
-    goto scope_end;
-  default:
-    goto error;
-  }
+  return SUCCESS;
+}
 
-finish:
-  next_json = parser.structurals.next_structural_index();
-  return parser.finish();
+/************
+ * The JSON is parsed to a tape, see the accompanying tape.md file
+ * for documentation.
+ ***********/
+WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept { // streaming counterpart of stage2()
+  return stage2::parse_structurals<true>(*this, _doc); // STREAMING=true: starts at next_structural_index; no end-of-input check is made here
+}
+/* end file src/generic/stage2/tape_writer.h */
 
-error:
-  return parser.error();
+WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
+  error_code err = stage1(_buf, _len, false);
+  if (err) { return err; }
+  return stage2(_doc);
 }
-/* end file src/generic/stage2/streaming_structural_parser.h */
 
 } // namespace arm64
 } // namespace simdjson
-
-#endif // SIMDJSON_ARM64_STAGE2_H
-/* end file src/generic/stage2/streaming_structural_parser.h */
+/* end file src/generic/stage2/tape_writer.h */
 #endif
 #if SIMDJSON_IMPLEMENTATION_FALLBACK
-/* begin file src/fallback/stage1.cpp */
+/* begin file src/fallback/implementation.cpp */
 /* fallback/implementation.h already included: #include "fallback/implementation.h" */
+/* begin file src/fallback/dom_parser_implementation.h */
+#ifndef SIMDJSON_FALLBACK_DOM_PARSER_IMPLEMENTATION_H
+#define SIMDJSON_FALLBACK_DOM_PARSER_IMPLEMENTATION_H
+
+/* isadetection.h already included: #include "isadetection.h" */
 
 namespace simdjson {
 namespace fallback {
-namespace stage1 {
 
-class structural_scanner {
+/* begin file src/generic/dom_parser_implementation.h */
+// expectation: sizeof(scope_descriptor) = 64/8.
+struct scope_descriptor {
+  uint32_t tape_index; // tape index at which this scope ('[' or '{') begins
+  uint32_t count; // number of elements in the scope
+}; // struct scope_descriptor
+
+#ifdef SIMDJSON_USE_COMPUTED_GOTO
+typedef void* ret_address_t;
+#else
+typedef char ret_address_t;
+#endif
+
+class dom_parser_implementation final : public internal::dom_parser_implementation { // parser state and entry points for this implementation
 public:
+  /** Tape index and element count of each open { or [ (array allocated by set_max_depth) */
+  std::unique_ptr<scope_descriptor[]> containing_scope{};
+  /** Return address of each open { or [ (array allocated by set_max_depth) */
+  std::unique_ptr<ret_address_t[]> ret_address{};
+  /** Buffer passed to stage 1 (raw pointer; presumably not owned by the parser -- confirm) */
+  const uint8_t *buf{};
+  /** Length passed to stage 1 */
+  size_t len{0};
+  /** Document passed to stage 2 (raw pointer; presumably not owned by the parser -- confirm) */
+  dom::document *doc{};
+  /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */
+  error_code error{UNINITIALIZED};
+
+  really_inline dom_parser_implementation(); // empty body; members above use their in-class initializers
+  dom_parser_implementation(const dom_parser_implementation &) = delete; // not copy-constructible
+  dom_parser_implementation & operator=(const dom_parser_implementation &) = delete; // not copy-assignable
+
+  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
+  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final;
+  WARN_UNUSED error_code check_for_unclosed_array() noexcept;
+  WARN_UNUSED error_code stage2(dom::document &doc) noexcept final;
+  WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final;
+  WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final; // allocates stage 1 state; _capacity becomes 0 on failure
+  WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final; // allocates stage 2 state; _max_depth becomes 0 on failure
+};
 
-really_inline structural_scanner(const uint8_t *_buf, uint32_t _len, parser &_doc_parser, bool _streaming)
-  : buf{_buf}, next_structural_index{_doc_parser.structural_indexes.get()}, doc_parser{_doc_parser}, idx{0}, len{_len}, error{SUCCESS}, streaming{_streaming} {}
+/* begin file src/generic/stage1/allocate.h */
+namespace stage1 {
+namespace allocate {
 
-really_inline void add_structural() {
-  *next_structural_index = idx;
-  next_structural_index++;
+//
+// Allocates stage 1 internal state and outputs in the parser
+//
+really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) {
+  // ROUNDUP_N(capacity, 64) slots (presumably capacity rounded up to a multiple of 64 -- confirm) plus 2 + 7 spare.
+  const size_t slot_count = ROUNDUP_N(capacity, 64) + 2 + 7;
+  parser.structural_indexes.reset( new (std::nothrow) uint32_t[slot_count] );
+  if (!parser.structural_indexes) { return MEMALLOC; } // nothrow new reports failure as a null pointer
+  parser.n_structural_indexes = 0; parser.structural_indexes[0] = 0; // start from an empty index list
+  return SUCCESS;
+}
+
+} // namespace allocate
+} // namespace stage1
+/* end file src/generic/stage1/allocate.h */
+/* begin file src/generic/stage2/allocate.h */
+namespace stage2 {
+namespace allocate {
+
+//
+// Allocates stage 2 internal state and outputs in the parser
+//
+really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) {
+  // One scope descriptor and one return address per nesting level.
+  parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]);
+  parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]);
+
+  // nothrow new yields null on failure; report MEMALLOC unless both arrays exist.
+  const bool both_allocated = parser.ret_address && parser.containing_scope;
+  return both_allocated ? SUCCESS : MEMALLOC;
+}
+
+} // namespace allocate
+} // namespace stage2
+/* end file src/generic/stage2/allocate.h */
+
+really_inline dom_parser_implementation::dom_parser_implementation() {} // nothing to do: the members declared in this class have in-class initializers
+
+// Leaving these here so they can be inlined if so desired
+WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
+  // Allocate stage 1 state; _capacity records the request on success and drops to 0 on failure.
+  const error_code alloc_err = stage1::allocate::set_capacity(*this, capacity);
+  _capacity = alloc_err ? 0 : capacity;
+  return alloc_err ? alloc_err : SUCCESS;
+}
+
+WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept {
+  // Allocate stage 2 state; _max_depth records the request on success and drops to 0 on failure.
+  const error_code alloc_err = stage2::allocate::set_max_depth(*this, max_depth);
+  _max_depth = alloc_err ? 0 : max_depth;
+  return alloc_err ? alloc_err : SUCCESS;
+}
+/* end file src/generic/stage2/allocate.h */
+
+} // namespace fallback
+} // namespace simdjson
+
+#endif // SIMDJSON_FALLBACK_DOM_PARSER_IMPLEMENTATION_H
+/* end file src/generic/stage2/allocate.h */
+
+TARGET_HASWELL
+
+namespace simdjson {
+namespace fallback {
+
+// Builds a fallback dom_parser_implementation into dst and sizes it for capacity/max_depth.
+// Returns MEMALLOC if the parser cannot be allocated, else the first error from set_capacity()/set_max_depth().
+WARN_UNUSED error_code implementation::create_dom_parser_implementation(
+  size_t capacity, size_t max_depth, std::unique_ptr<internal::dom_parser_implementation>& dst
+) const noexcept {
+  dst.reset( new (std::nothrow) dom_parser_implementation() );
+  if (!dst) { return MEMALLOC; }
+  if (error_code err = dst->set_capacity(capacity)) { return err; }   // propagate allocation failure to the caller
+  if (error_code err = dst->set_max_depth(max_depth)) { return err; } // rather than reporting SUCCESS regardless
+  return SUCCESS;
+}
+
+} // namespace fallback
+} // namespace simdjson
+
+UNTARGET_REGION
+/* end file src/generic/stage2/allocate.h */
+/* begin file src/fallback/dom_parser_implementation.cpp */
+/* fallback/implementation.h already included: #include "fallback/implementation.h" */
+/* fallback/dom_parser_implementation.h already included: #include "fallback/dom_parser_implementation.h" */
+
+//
+// Stage 1
+//
+namespace simdjson {
+namespace fallback {
+namespace stage1 {
+
+/* begin file src/generic/stage1/find_next_document_index.h */
+/**
+  * This algorithm is used to quickly identify the last structural position that
+  * makes up a complete document.
+  *
+  * It does this by going backwards and finding the last *document boundary* (a
+  * place where one value follows another without a comma between them). If the
+  * last document (the characters after the boundary) has an equal number of
+  * start and end brackets, it is considered complete.
+  *
+  * Simply put, we iterate over the structural characters, starting from
+  * the end. We consider that we found the end of a JSON document when the
+  * first element of the pair is NOT one of these characters: '{' '[' ':' ','
+  * and when the second element is NOT one of these characters: '}' ']' ':' ','.
+  *
+  * This simple comparison works most of the time, but it does not cover cases
+  * where the batch's structural indexes contain a perfect amount of documents.
+  * In such a case, we do not have access to the structural index which follows
+  * the last document, therefore, we do not have access to the second element in
+  * the pair, and that means we cannot identify the last document. To fix this
+  * issue, we keep a count of the open and closed curly/square braces we found
+  * while searching for the pair. When we find a pair AND the count of open and
+  * closed curly/square braces is the same, we know that we just passed a
+  * complete document, therefore the last json buffer location is the end of the
+  * batch.
+  */
+really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) {
+  // No structurals at all: return before `n_structural_indexes - 1` below can wrap (if the counter is unsigned).
+  if (parser.n_structural_indexes == 0) { return 0; }
+  // TODO don't count separately, just figure out depth
+  auto arr_cnt = 0;
+  auto obj_cnt = 0;
+  for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
+    auto idxb = parser.structural_indexes[i];
+    switch (parser.buf[idxb]) {
+    case ':': case ',': // separators never start a document
+      continue;
+    case '}':
+      obj_cnt--;
+      continue;
+    case ']':
+      arr_cnt--;
+      continue;
+    case '{':
+      obj_cnt++;
+      break;
+    case '[':
+      arr_cnt++;
+      break;
+    }
+    // idxb may start a value; it starts a new document only if the structural before it can end one
+    auto idxa = parser.structural_indexes[i - 1];
+    switch (parser.buf[idxa]) {
+    case '{': case '[':
+    case ':': case ',':
+      continue;
+    }
+    // Last document is complete, so the next document will appear after!
+    if (!arr_cnt && !obj_cnt) {
+      return parser.n_structural_indexes;
+    }
+    // Last document is incomplete; it begins at structural i, so that is where the next one starts
+    return i;
+  }
+  return 0;
+}
+
+// Skip the last character if it is partial
+really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
+  // Inspect at most the last three bytes, nearest the end first. A lead byte that
+  // announces more bytes than remain after it marks a character cut off by the end
+  // of the buffer; the returned length drops that character. The `len >= k` guards
+  // keep short buffers (0, 1 or 2 bytes) from being read out of bounds.
+  //
+  // Last byte is any lead byte (2-, 3- or 4-byte character): only 1 byte present.
+  const bool cut_after_1 = len >= 1 && buf[len-1] >= 0b11000000;
+  if (cut_after_1) { return len-1; }
+
+  // Second-to-last byte leads a 3- or 4-byte character: only 2 bytes present.
+  const bool cut_after_2 = len >= 2 && buf[len-2] >= 0b11100000;
+  if (cut_after_2) { return len-2; }
+
+  // Third-to-last byte leads a 4-byte character: only 3 bytes present.
+  const bool cut_after_3 = len >= 3 && buf[len-3] >= 0b11110000;
+  if (cut_after_3) { return len-3; }
+  return len;
+}
+/* end file src/generic/stage1/find_next_document_index.h */
+
+class structural_scanner {
+public:
+
+really_inline structural_scanner(dom_parser_implementation &_parser, bool _partial)
+  : buf{_parser.buf}, // input bytes, taken from the parser
+    next_structural_index{_parser.structural_indexes.get()}, // write cursor; starts at the first index slot
+    parser{_parser},
+    len{static_cast<uint32_t>(_parser.len)}, // NOTE(review): narrows size_t to 32 bits; presumably inputs are capped below 4GB elsewhere -- confirm
+    partial{_partial} { // NOTE(review): idx and error are not initialized here; presumably they have in-class initializers -- confirm
+}
+
+really_inline void add_structural() {
+  *next_structural_index = idx;
+  next_structural_index++;
 }
 
 really_inline bool is_continuation(uint8_t c) {
@@ -5234,7 +5553,12 @@ really_inline void validate_utf8_character() {
   // 2-byte
   if ((buf[idx] & 0b00100000) == 0) {
     // missing continuation
-    if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { error = UTF8_ERROR; idx++; return; }
+    if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
+      if (idx+1 > len && partial) { idx = len; return; }
+      error = UTF8_ERROR;
+      idx++;
+      return;
+    }
     // overlong: 1100000_ 10______
     if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; }
     idx += 2;
@@ -5244,7 +5568,12 @@ really_inline void validate_utf8_character() {
   // 3-byte
   if ((buf[idx] & 0b00010000) == 0) {
     // missing continuation
-    if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { error = UTF8_ERROR; idx++; return; }
+    if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
+      if (idx+2 > len && partial) { idx = len; return; }
+      error = UTF8_ERROR;
+      idx++;
+      return;
+    }
     // overlong: 11100000 100_____ ________
     if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; }
     // surrogates: U+D800-U+DFFF 11101101 101_____
@@ -5255,7 +5584,12 @@ really_inline void validate_utf8_character() {
 
   // 4-byte
   // missing continuation
-  if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { error = UTF8_ERROR; idx++; return; }
+  if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
+    if (idx+2 > len && partial) { idx = len; return; }
+    error = UTF8_ERROR;
+    idx++;
+    return;
+  }
   // overlong: 11110000 1000____ ________ ________
   if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; }
   // too large: > U+10FFFF:
@@ -5280,7 +5614,7 @@ really_inline void validate_string() {
       idx++;
     }
   }
-  if (idx >= len && !streaming) { error = UNCLOSED_STRING; }
+  if (idx >= len && !partial) { error = UNCLOSED_STRING; }
 }
 
 really_inline bool is_whitespace_or_operator(uint8_t c) {
@@ -5321,33 +5655,46 @@ really_inline error_code scan() {
         break;
     }
   }
-  if (unlikely(next_structural_index == doc_parser.structural_indexes.get())) {
+  *next_structural_index = len;
+  // We pad beyond.
+  // https://github.com/simdjson/simdjson/issues/906
+  next_structural_index[1] = len;
+  next_structural_index[2] = 0;
+  parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
+  parser.next_structural_index = 0;
+
+  if (unlikely(parser.n_structural_indexes == 0)) {
     return EMPTY;
   }
-  *next_structural_index = len;
-  next_structural_index++;
-  doc_parser.n_structural_indexes = uint32_t(next_structural_index - doc_parser.structural_indexes.get());
+
+  if (partial) {
+    auto new_structural_indexes = find_next_document_index(parser);
+    if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
+      return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
+    }
+    parser.n_structural_indexes = new_structural_indexes;
+  }
+
   return error;
 }
 
 private:
   const uint8_t *buf;
   uint32_t *next_structural_index;
-  parser &doc_parser;
-  uint32_t idx;
+  dom_parser_implementation &parser;
   uint32_t len;
-  error_code error;
-  bool streaming;
+  uint32_t idx{0};
+  error_code error{SUCCESS};
+  bool partial;
 }; // structural_scanner
 
 } // namespace stage1
 
 
-WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept {
-  if (unlikely(len > parser.capacity())) {
-    return CAPACITY;
-  }
-  stage1::structural_scanner scanner(buf, uint32_t(len), parser, streaming);
+WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool partial) noexcept {
+  this->buf = _buf;
+  this->len = _len;
+  stage1::structural_scanner scanner(*this, partial);
   return scanner.scan();
 }
 
@@ -5409,10 +5756,10 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
 
 } // namespace fallback
 } // namespace simdjson
-/* end file src/fallback/stage1.cpp */
-/* begin file src/fallback/stage2.cpp */
 
-/* fallback/implementation.h already included: #include "fallback/implementation.h" */
+//
+// Stage 2
+//
 /* begin file src/fallback/stringparsing.h */
 #ifndef SIMDJSON_FALLBACK_STRINGPARSING_H
 #define SIMDJSON_FALLBACK_STRINGPARSING_H
@@ -5872,10 +6219,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
   // If you consume a large value and you map it to "infinity", you will no
   // longer be able to serialize back a standard-compliant JSON. And there is
   // no realistic application where you might need values so large than they
-  // can't fit in binary64. The maximal value is about  1.7976931348623157 ×
+  // can't fit in binary64. The maximal value is about  1.7976931348623157 x
   // 10^308 It is an unimaginable large number. There will never be any piece of
   // engineering involving as many as 10^308 parts. It is estimated that there
-  // are about 10^80 atoms in the universe.  The estimate for the total number
+  // are about 10^80 atoms in the universe.  The estimate for the total number
   // of electrons is similar. Using a double-precision floating-point value, we
   // can represent easily the number of atoms in the universe. We could  also
   // represent the number of ways you can pick any three individual atoms at
@@ -5895,26 +6242,6 @@ really_inline bool is_integer(char c) {
   // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers
 }
 
-// We need to check that the character following a zero is valid. This is
-// probably frequent and it is harder than it looks. We are building all of this
-// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)...
-const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
-    1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-
-really_inline bool
-is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
-  return structural_or_whitespace_or_exponent_or_decimal_negated[c];
-}
 
 // check quickly whether the next 8 chars are made of digits
 // at a glance, it looks better than Mula's
@@ -5992,14 +6319,14 @@ never_inline bool parse_large_integer(const uint8_t *const src,
       // as a positive signed integer, but the negative version is
       // possible.
       constexpr int64_t signed_answer = INT64_MIN;
-      writer.write_s64(signed_answer);
+      writer.append_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
     } else {
       // we can negate safely
       int64_t signed_answer = -static_cast<int64_t>(i);
-      writer.write_s64(signed_answer);
+      writer.append_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
@@ -6012,12 +6339,12 @@ never_inline bool parse_large_integer(const uint8_t *const src,
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(i, src);
 #endif
-      writer.write_s64(i);
+      writer.append_s64(i);
     } else {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_unsigned_integer(i, src);
 #endif
-      writer.write_u64(i);
+      writer.append_u64(i);
     }
   }
   return is_structural_or_whitespace(*p);
@@ -6027,7 +6354,7 @@ template<typename W>
 bool slow_float_parsing(UNUSED const char * src, W writer) {
   double d;
   if (parse_float_strtod(src, &d)) {
-    writer.write_double(d);
+    writer.append_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_float(d, (const uint8_t *)src);
 #endif
@@ -6051,10 +6378,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) {
 template<typename W>
 really_inline bool parse_number(UNUSED const uint8_t *const src,
                                 UNUSED bool found_minus,
-                                W writer) {
+                                W &writer) {
 #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
                                   // useful to skip parsing
-  writer.write_s64(0);        // always write zero
+  writer.append_s64(0);        // always write zero
   return true;                    // always succeeds
 #else
   const char *p = reinterpret_cast<const char *>(src);
@@ -6074,7 +6401,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
   uint64_t i;      // an unsigned int avoids signed overflows (which are bad)
   if (*p == '0') { // 0 cannot be followed by an integer
     ++p;
-    if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
+    if (is_integer(*p)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_invalid_number(src);
 #endif
@@ -6198,7 +6525,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       }
       // we over-decrement by one when there is a '.'
       digit_count -= int(start - start_digits);
-      if (digit_count >= 19) {
+      if (unlikely(digit_count >= 19)) {
         // Ok, chances are good that we had an overflow!
         // this is almost never going to get called!!!
         // we start anew, going slowly!!!
@@ -6206,14 +6533,22 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
         // 10000000000000000000000000000000000000000000e+308
         // 3.1415926535897932384626433832795028841971693993751
         //
-        return slow_float_parsing((const char *) src, writer);
+        bool success = slow_float_parsing((const char *) src, writer);
+        // The number was already written, but we made a copy of the writer when we passed it to the
+        // slow_float_parsing() function, so we have to skip those tape spots now that we've returned
+        writer.skip_double();
+        return success;
       }
     }
     if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) ||
         (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!!
       // this is almost never going to get called!!!
       // we start anew, going slowly!!!
-      return slow_float_parsing((const char *) src, writer);
+      bool success = slow_float_parsing((const char *) src, writer);
+      // The number was already written, but we made a copy of the writer when we passed it to the
+      // slow_float_parsing() function, so we have to skip those tape spots now that we've returned
+      writer.skip_double();
+      return success;
     }
     bool success = true;
     double d = compute_float_64(exponent, i, negative, &success);
@@ -6222,7 +6557,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       success = parse_float_strtod((const char *)src, &d);
     }
     if (success) {
-      writer.write_double(d);
+      writer.append_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_float(d, src);
 #endif
@@ -6237,10 +6572,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
     if (unlikely(digit_count >= 18)) { // this is uncommon!!!
       // there is a good chance that we had an overflow, so we need
       // need to recover: we parse the whole thing again.
-      return parse_large_integer(src, writer, found_minus);
+      bool success = parse_large_integer(src, writer, found_minus);
+      // The number was already written, but we made a copy of the writer when we passed it to the
+      // parse_large_integer() function, so we have to skip those tape spots now that we've returned
+      writer.skip_large_integer();
+      return success;
     }
     i = negative ? 0 - i : i;
-    writer.write_s64(i);
+    writer.append_s64(i);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_integer(i, src);
 #endif
@@ -6263,6 +6602,72 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
 namespace simdjson {
 namespace fallback {
 
+/* begin file src/generic/stage2/logger.h */
+// This is for an internal-only stage 2 specific logger.
+// Set LOG_ENABLED = true to log what stage 2 is doing!
+namespace logger {
+  static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
+
+  static constexpr const bool LOG_ENABLED = false;
+  static constexpr const int LOG_EVENT_LEN = 30;
+  static constexpr const int LOG_BUFFER_LEN = 20;
+  static constexpr const int LOG_DETAIL_LEN = 50;
+  static constexpr const int LOG_INDEX_LEN = 10;
+
+  static int log_depth; // Not threadsafe. Log only.
+
+  // Helper to turn unprintable or newline characters into spaces
+  static really_inline char printable_char(char c) {
+    if (c >= 0x20) {
+      return c;
+    } else {
+      return ' ';
+    }
+  }
+
+  // Reset log_depth and print the log table header
+  static really_inline void log_start() {
+    if (LOG_ENABLED) {
+      log_depth = 0;
+      printf("\n");
+      printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index");
+      printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES);
+    }
+  }
+
+  static really_inline void log_string(const char *message) {
+    if (LOG_ENABLED) {
+      printf("%s\n", message);
+    }
+  }
+
+  // Logs a single table row for the current position: title, buffer preview, current/next chars, indexes, detail
+  template<typename S>
+  static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
+    if (LOG_ENABLED) {
+      printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
+      {
+        // Print the next LOG_BUFFER_LEN characters in the buffer.
+        printf("| ");
+        // The preview starts at the buffer position of the current structural.
+        // Unprintable or newline characters are printed as spaces.
+        for (int i=0;i<LOG_BUFFER_LEN;i++) {
+          printf("%c", printable_char(structurals.current()[i]));
+        }
+        printf(" ");
+      }
+      printf("|    %c ", printable_char(structurals.current_char()));
+      printf("|    %c ", printable_char(structurals.peek_next_char()));
+      printf("| %5u ", structurals.parser.structural_indexes[*(structurals.current_structural+1)]);
+      printf("| %5u ", structurals.next_tape_index());
+      printf("| %-*s ", LOG_DETAIL_LEN, detail);
+      printf("| %*u ", LOG_INDEX_LEN, *structurals.current_structural);
+      printf("|\n");
+    }
+  }
+} // namespace logger
+
+/* end file src/generic/stage2/logger.h */
 /* begin file src/generic/stage2/atomparsing.h */
 namespace stage2 {
 namespace atomparsing {
@@ -6321,26 +6726,34 @@ namespace stage2 {
 
 class structural_iterator {
 public:
-  really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index)
-    : buf{_buf},
-     len{_len},
-     structural_indexes{_structural_indexes},
-     next_structural{next_structural_index}
-    {}
-  really_inline char advance_char() {
-    idx = structural_indexes[next_structural];
-    next_structural++;
-    c = *current();
-    return c;
+  const uint8_t* const buf;
+  uint32_t *current_structural;
+  dom_parser_implementation &parser;
+
+  // Start a structural iterator at the given index into the parser's structural_indexes
+  really_inline structural_iterator(dom_parser_implementation &_parser, size_t start_structural_index)
+    : buf{_parser.buf},
+      current_structural{&_parser.structural_indexes[start_structural_index]},
+      parser{_parser} {
+  }
+  // Get the buffer position of the current structural character
+  really_inline const uint8_t* current() {
+    return &buf[*current_structural];
   }
+  // Get the current structural character
   really_inline char current_char() {
-    return c;
+    return buf[*current_structural];
   }
-  really_inline const uint8_t* current() {
-    return &buf[idx];
+  // Get the next structural character without advancing
+  really_inline char peek_next_char() {
+    return buf[*(current_structural+1)];
+  }
+  really_inline char advance_char() {
+    current_structural++;
+    return buf[*current_structural];
   }
   really_inline size_t remaining_len() {
-    return len - idx;
+    return parser.len - *current_structural;
   }
   template<typename F>
   really_inline bool with_space_terminated_copy(const F& f) {
@@ -6357,32 +6770,25 @@ class structural_iterator {
     * practice unless you are in the strange scenario where you have many JSON
     * documents made of single atoms.
     */
-    char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
+    char *copy = static_cast<char *>(malloc(parser.len + SIMDJSON_PADDING));
     if (copy == nullptr) {
       return true;
     }
-    memcpy(copy, buf, len);
-    memset(copy + len, ' ', SIMDJSON_PADDING);
-    bool result = f(reinterpret_cast<const uint8_t*>(copy), idx);
+    memcpy(copy, buf, parser.len);
+    memset(copy + parser.len, ' ', SIMDJSON_PADDING);
+    bool result = f(reinterpret_cast<const uint8_t*>(copy), *current_structural);
     free(copy);
     return result;
   }
   really_inline bool past_end(uint32_t n_structural_indexes) {
-    return next_structural+1 > n_structural_indexes;
+    return current_structural >= &parser.structural_indexes[n_structural_indexes];
   }
   really_inline bool at_end(uint32_t n_structural_indexes) {
-    return next_structural+1 == n_structural_indexes;
+    return current_structural == &parser.structural_indexes[n_structural_indexes];
   }
-  really_inline size_t next_structural_index() {
-    return next_structural;
+  really_inline bool at_beginning() {
+    return current_structural == parser.structural_indexes.get();
   }
-
-  const uint8_t* const buf;
-  const size_t len;
-  const uint32_t* const structural_indexes;
-  size_t next_structural; // next structural index
-  size_t idx{0}; // location of the structural character in the input (buf)
-  uint8_t c{0};  // used to track the (structural) character we are looking at
 };
 
 } // namespace stage2
@@ -6394,8 +6800,105 @@ class structural_iterator {
 // "simdjson/stage2.h" (this simplifies amalgation)
 
 namespace stage2 {
+namespace { // Make everything here private
+
+/* begin file src/generic/stage2/tape_writer.h */
+struct tape_writer {
+  /** The next place to write to tape */
+  uint64_t *next_tape_loc;
+  
+  /** Write a signed 64-bit value to tape. */
+  really_inline void append_s64(int64_t value) noexcept;
+
+  /** Write an unsigned 64-bit value to tape. */
+  really_inline void append_u64(uint64_t value) noexcept;
+
+  /** Write a double value to tape. */
+  really_inline void append_double(double value) noexcept;
+
+  /**
+   * Append a tape entry (an 8-bit type, and 56 bits worth of value).
+   */
+  really_inline void append(uint64_t val, internal::tape_type t) noexcept;
+
+  /**
+   * Skip the current tape entry without writing.
+   *
+   * Used to skip the start of the container, since we'll come back later to fill it in when the
+   * container ends.
+   */
+  really_inline void skip() noexcept;
+
+  /**
+   * Skip the number of tape entries necessary to write a large u64 or i64.
+   */
+  really_inline void skip_large_integer() noexcept;
+
+  /**
+   * Skip the number of tape entries necessary to write a double.
+   */
+  really_inline void skip_double() noexcept;
+
+  /**
+   * Write a value to a known location on tape.
+   *
+   * Used to go back and write out the start of a container after the container ends.
+   */
+  really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
+
+private:
+  /**
+   * Append both the tape entry, and a supplementary value following it. Used for types that need
+   * all 64 bits, such as double and uint64_t.
+   */
+  template<typename T>
+  really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
+}; // struct tape_writer
+
+really_inline void tape_writer::append_s64(int64_t value) noexcept {
+  append2(0, value, internal::tape_type::INT64);
+}
+
+really_inline void tape_writer::append_u64(uint64_t value) noexcept {
+  append(0, internal::tape_type::UINT64);
+  *next_tape_loc = value;
+  next_tape_loc++;
+}
+
+/** Write a double value to tape. */
+really_inline void tape_writer::append_double(double value) noexcept {
+  append2(0, value, internal::tape_type::DOUBLE);
+}
+
+really_inline void tape_writer::skip() noexcept {
+  next_tape_loc++;
+}
+
+really_inline void tape_writer::skip_large_integer() noexcept {
+  next_tape_loc += 2;
+}
+
+really_inline void tape_writer::skip_double() noexcept {
+  next_tape_loc += 2;
+}
+
+really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
+  *next_tape_loc = val | ((uint64_t(char(t))) << 56);
+  next_tape_loc++;
+}
+
+template<typename T>
+really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
+  append(val, t);
+  static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
+  memcpy(next_tape_loc, &val2, sizeof(val2));
+  next_tape_loc++;
+}
 
-using internal::ret_address;
+really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
+  tape_loc = val | ((uint64_t(char(t))) << 56);
+}
+/* end file src/generic/stage2/tape_writer.h */
 
 #ifdef SIMDJSON_USE_COMPUTED_GOTO
 #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue }
@@ -6426,102 +6929,88 @@ using internal::ret_address;
 #endif // SIMDJSON_USE_COMPUTED_GOTO
 
 struct unified_machine_addresses {
-  ret_address array_begin;
-  ret_address array_continue;
-  ret_address error;
-  ret_address finish;
-  ret_address object_begin;
-  ret_address object_continue;
+  ret_address_t array_begin;
+  ret_address_t array_continue;
+  ret_address_t error;
+  ret_address_t finish;
+  ret_address_t object_begin;
+  ret_address_t object_continue;
 };
 
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } }
 
-struct number_writer {
-  parser &doc_parser;
-  
-  really_inline void write_s64(int64_t value) noexcept {
-    write_tape(0, internal::tape_type::INT64);
-    std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value));
-    ++doc_parser.current_loc;
-  }
-  really_inline void write_u64(uint64_t value) noexcept {
-    write_tape(0, internal::tape_type::UINT64);
-    doc_parser.doc.tape[doc_parser.current_loc++] = value;
-  }
-  really_inline void write_double(double value) noexcept {
-    write_tape(0, internal::tape_type::DOUBLE);
-    static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size");
-    memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double));
-    // doc.tape[doc.current_loc++] = *((uint64_t *)&d);
-  }
-  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
-    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
-  }
-}; // struct number_writer
-
-struct structural_parser {
-  structural_iterator structurals;
-  parser &doc_parser;
+struct structural_parser : structural_iterator {
+  /** Lets you append to the tape */
+  tape_writer tape;
   /** Next write location in the string buf for stage 2 parsing */
-  uint8_t *current_string_buf_loc{};
-  uint32_t depth;
-
-  really_inline structural_parser(
-    const uint8_t *buf,
-    size_t len,
-    parser &_doc_parser,
-    uint32_t next_structural = 0
-  ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {}
-
-  WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) {
-    doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc;
-    doc_parser.containing_scope[depth].count = 0;
-    write_tape(0, type); // if the document is correct, this gets rewritten later
-    doc_parser.ret_address[depth] = continue_state;
+  uint8_t *current_string_buf_loc;
+  /** Current depth (nested objects and arrays) */
+  uint32_t depth{0};
+
+  // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations
+  really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index)
+    : structural_iterator(_parser, start_structural_index),
+      tape{parser.doc->tape.get()},
+      current_string_buf_loc{parser.doc->string_buf.get()} {
+  }
+
+  WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) {
+    parser.containing_scope[depth].tape_index = next_tape_index();
+    parser.containing_scope[depth].count = 0;
+    tape.skip(); // We don't actually *write* the start element until the end.
+    parser.ret_address[depth] = continue_state;
     depth++;
-    return depth >= doc_parser.max_depth();
+    bool exceeded_max_depth = depth >= parser.max_depth();
+    if (exceeded_max_depth) { log_error("Exceeded max depth!"); }
+    return exceeded_max_depth;
   }
 
-  WARN_UNUSED really_inline bool start_document(ret_address continue_state) {
-    return start_scope(internal::tape_type::ROOT, continue_state);
+  WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) {
+    log_start_value("document");
+    return start_scope(continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_object(ret_address continue_state) {
-    return start_scope(internal::tape_type::START_OBJECT, continue_state);
+  WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) {
+    log_start_value("object");
+    return start_scope(continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_array(ret_address continue_state) {
-    return start_scope(internal::tape_type::START_ARRAY, continue_state);
+  WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) {
+    log_start_value("array");
+    return start_scope(continue_state);
   }
 
   // this function is responsible for annotating the start of the scope
-  really_inline void end_scope(internal::tape_type type) noexcept {
+  really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept {
     depth--;
-    // write our doc.tape location to the header scope
+    // write our doc->tape location to the header scope
     // The root scope gets written *at* the previous location.
-    write_tape(doc_parser.containing_scope[depth].tape_index, type);
+    tape.append(parser.containing_scope[depth].tape_index, end);
     // count can overflow if it exceeds 24 bits... so we saturate
     // the convention being that a cnt of 0xffffff or more is undetermined in value (>=  0xffffff).
-    const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index;
-    const uint32_t count = doc_parser.containing_scope[depth].count;
+    const uint32_t start_tape_index = parser.containing_scope[depth].tape_index;
+    const uint32_t count = parser.containing_scope[depth].count;
     const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
-    // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index]
-    doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32);
+    // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index]
+    tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start);
+  }
+
+  really_inline uint32_t next_tape_index() {
+    return uint32_t(tape.next_tape_loc - parser.doc->tape.get());
   }
 
   really_inline void end_object() {
-    end_scope(internal::tape_type::END_OBJECT);
+    log_end_value("object");
+    end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
   }
   really_inline void end_array() {
-    end_scope(internal::tape_type::END_ARRAY);
+    log_end_value("array");
+    end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
   }
   really_inline void end_document() {
-    end_scope(internal::tape_type::ROOT);
-  }
-
-  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
-    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
+    log_end_value("document");
+    end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT);
   }
 
   // increment_count increments the count of keys in an object or values in an array.
@@ -6529,17 +7018,16 @@ struct structural_parser {
   // must be increment in the preceding depth (depth-1) where the array or
   // the object resides.
   really_inline void increment_count() {
-    doc_parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
+    parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
   }
 
   really_inline uint8_t *on_start_string() noexcept {
-    /* we advance the point, accounting for the fact that we have a NULL
-      * termination         */
-    write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING);
+    // we advance the point, accounting for the fact that we have a NULL termination
+    tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING);
     return current_string_buf_loc + sizeof(uint32_t);
   }
 
-  really_inline bool on_end_string(uint8_t *dst) noexcept {
+  really_inline void on_end_string(uint8_t *dst) noexcept {
     uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
     // TODO check for overflow in case someone has a crazy string (>=4GB?)
     // But only add the overflow check when the document itself exceeds 4GB
@@ -6549,73 +7037,49 @@ struct structural_parser {
     // be NULL terminated? It comes at a small cost
     *dst = 0;
     current_string_buf_loc = dst + 1;
-    return true;
   }
 
-  WARN_UNUSED really_inline bool parse_string() {
+  WARN_UNUSED really_inline bool parse_string(bool key = false) {
+    log_value(key ? "key" : "string");
     uint8_t *dst = on_start_string();
-    dst = stringparsing::parse_string(structurals.current(), dst);
+    dst = stringparsing::parse_string(current(), dst);
     if (dst == nullptr) {
+      log_error("Invalid escape in string");
       return true;
     }
-    return !on_end_string(dst);
+    on_end_string(dst);
+    return false;
   }
 
   WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) {
-    number_writer writer{doc_parser};
-    return !numberparsing::parse_number(src, found_minus, writer);
+    log_value("number");
+    bool succeeded = numberparsing::parse_number(src, found_minus, tape);
+    if (!succeeded) { log_error("Invalid number"); }
+    return !succeeded;
   }
   WARN_UNUSED really_inline bool parse_number(bool found_minus) {
-    return parse_number(structurals.current(), found_minus);
-  }
-
-  WARN_UNUSED really_inline bool parse_atom() {
-    switch (structurals.current_char()) {
-      case 't':
-        if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; }
-        write_tape(0, internal::tape_type::TRUE_VALUE);
-        break;
-      case 'f':
-        if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; }
-        write_tape(0, internal::tape_type::FALSE_VALUE);
-        break;
-      case 'n':
-        if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; }
-        write_tape(0, internal::tape_type::NULL_VALUE);
-        break;
-      default:
-        return true;
-    }
-    return false;
-  }
-
-  WARN_UNUSED really_inline bool parse_single_atom() {
-    switch (structurals.current_char()) {
-      case 't':
-        if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; }
-        write_tape(0, internal::tape_type::TRUE_VALUE);
-        break;
-      case 'f':
-        if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; }
-        write_tape(0, internal::tape_type::FALSE_VALUE);
-        break;
-      case 'n':
-        if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; }
-        write_tape(0, internal::tape_type::NULL_VALUE);
-        break;
-      default:
-        return true;
-    }
-    return false;
+    return parse_number(current(), found_minus);
   }
 
-  WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) {
-    switch (structurals.current_char()) {
+  WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) {
+    switch (advance_char()) {
     case '"':
       FAIL_IF( parse_string() );
       return continue_state;
-    case 't': case 'f': case 'n':
-      FAIL_IF( parse_atom() );
+    case 't':
+      log_value("true");
+      FAIL_IF( !atomparsing::is_valid_true_atom(current()) );
+      tape.append(0, internal::tape_type::TRUE_VALUE);
+      return continue_state;
+    case 'f':
+      log_value("false");
+      FAIL_IF( !atomparsing::is_valid_false_atom(current()) );
+      tape.append(0, internal::tape_type::FALSE_VALUE);
+      return continue_state;
+    case 'n':
+      log_value("null");
+      FAIL_IF( !atomparsing::is_valid_null_atom(current()) );
+      tape.append(0, internal::tape_type::NULL_VALUE);
       return continue_state;
     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
@@ -6631,40 +7095,27 @@ struct structural_parser {
       FAIL_IF( start_array(continue_state) );
       return addresses.array_begin;
     default:
+      log_error("Non-value found when value was expected!");
       return addresses.error;
     }
   }
 
   WARN_UNUSED really_inline error_code finish() {
-    // the string might not be NULL terminated.
-    if ( !structurals.at_end(doc_parser.n_structural_indexes) ) {
-      return on_error(TAPE_ERROR);
-    }
     end_document();
+    parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]);
+
     if (depth != 0) {
-      return on_error(TAPE_ERROR);
+      log_error("Unclosed objects or arrays!");
+      return parser.error = TAPE_ERROR;
     }
-    if (doc_parser.containing_scope[depth].tape_index != 0) {
-      return on_error(TAPE_ERROR);
-    }
-
-    return on_success(SUCCESS);
-  }
 
-  really_inline error_code on_error(error_code new_error_code) noexcept {
-    doc_parser.error = new_error_code;
-    return new_error_code;
-  }
-  really_inline error_code on_success(error_code success_code) noexcept {
-    doc_parser.error = success_code;
-    doc_parser.valid = true;
-    return success_code;
+    return SUCCESS;
   }
 
   WARN_UNUSED really_inline error_code error() {
-    /* We do not need the next line because this is done by doc_parser.init_stage2(),
+    /* We do not need the next line because this is done by parser.init_stage2(),
     * pessimistically.
-    * doc_parser.is_valid  = false;
+    * parser.is_valid  = false;
     * At this point in the code, we have all the time in the world.
     * Note that we know exactly where we are in the document so we could,
     * without any overhead on the processing code, report a specific
@@ -6672,12 +7123,12 @@ struct structural_parser {
     * We could even trigger special code paths to assess what happened
     * carefully,
     * all without any added cost. */
-    if (depth >= doc_parser.max_depth()) {
-      return on_error(DEPTH_ERROR);
+    if (depth >= parser.max_depth()) {
+      return parser.error = DEPTH_ERROR;
     }
-    switch (structurals.current_char()) {
+    switch (current_char()) {
     case '"':
-      return on_error(STRING_ERROR);
+      return parser.error = STRING_ERROR;
     case '0':
     case '1':
     case '2':
@@ -6689,92 +7140,124 @@ struct structural_parser {
     case '8':
     case '9':
     case '-':
-      return on_error(NUMBER_ERROR);
+      return parser.error = NUMBER_ERROR;
     case 't':
-      return on_error(T_ATOM_ERROR);
+      return parser.error = T_ATOM_ERROR;
     case 'n':
-      return on_error(N_ATOM_ERROR);
+      return parser.error = N_ATOM_ERROR;
     case 'f':
-      return on_error(F_ATOM_ERROR);
+      return parser.error = F_ATOM_ERROR;
     default:
-      return on_error(TAPE_ERROR);
+      return parser.error = TAPE_ERROR;
     }
   }
 
   really_inline void init() {
-    current_string_buf_loc = doc_parser.doc.string_buf.get();
-    doc_parser.current_loc = 0;
-    doc_parser.valid = false;
-    doc_parser.error = UNINITIALIZED;
+    log_start();
+    parser.error = UNINITIALIZED;
   }
 
-  WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) {
-    init(); // sets is_valid to false
-    if (len > doc_parser.capacity()) {
-      return CAPACITY;
+  WARN_UNUSED really_inline error_code start(ret_address_t finish_state) {
+    // If there are no structurals left, return EMPTY
+    if (at_end(parser.n_structural_indexes)) {
+      return parser.error = EMPTY;
     }
-    // Advance to the first character as soon as possible
-    structurals.advance_char();
+
+    init();
     // Push the root scope (there is always at least one scope)
     if (start_document(finish_state)) {
-      return on_error(DEPTH_ERROR);
+      return parser.error = DEPTH_ERROR;
     }
     return SUCCESS;
   }
 
-  really_inline char advance_char() {
-    return structurals.advance_char();
+  really_inline void log_value(const char *type) {
+    logger::log_line(*this, "", type, "");
+  }
+
+  static really_inline void log_start() {
+    logger::log_start();
+  }
+
+  really_inline void log_start_value(const char *type) {
+    logger::log_line(*this, "+", type, "");
+    if (logger::LOG_ENABLED) { logger::log_depth++; }
+  }
+
+  really_inline void log_end_value(const char *type) {
+    if (logger::LOG_ENABLED) { logger::log_depth--; }
+    logger::log_line(*this, "-", type, "");
   }
-};
+
+  really_inline void log_error(const char *error) {
+    logger::log_line(*this, "", "ERROR", error);
+  }
+}; // struct structural_parser
 
 // Redefine FAIL_IF to use goto since it'll be used inside the function now
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { goto error; } }
 
-} // namespace stage2
-
-/************
- * The JSON is parsed to a tape, see the accompanying tape.md file
- * for documentation.
- ***********/
-WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
+template<bool STREAMING>
+WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
+  dom_parser.doc = &doc;
   static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
-  stage2::structural_parser parser(buf, len, doc_parser);
-  error_code result = parser.start(len, addresses.finish);
+  stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
+  error_code result = parser.start(addresses.finish);
   if (result) { return result; }
 
   //
   // Read first value
   //
-  switch (parser.structurals.current_char()) {
+  switch (parser.current_char()) {
   case '{':
     FAIL_IF( parser.start_object(addresses.finish) );
     goto object_begin;
   case '[':
     FAIL_IF( parser.start_array(addresses.finish) );
+    // Make sure the outer array is closed before continuing; otherwise, there are ways we could get
+    // into memory corruption. See https://github.com/simdjson/simdjson/issues/906
+    if (!STREAMING) {
+      if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') {
+        goto error;
+      }
+    }
     goto array_begin;
   case '"':
     FAIL_IF( parser.parse_string() );
     goto finish;
-  case 't': case 'f': case 'n':
-    FAIL_IF( parser.parse_single_atom() );
+  case 't':
+    parser.log_value("true");
+    FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) );
+    parser.tape.append(0, internal::tape_type::TRUE_VALUE);
+    goto finish;
+  case 'f':
+    parser.log_value("false");
+    FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) );
+    parser.tape.append(0, internal::tape_type::FALSE_VALUE);
+    goto finish;
+  case 'n':
+    parser.log_value("null");
+    FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) );
+    parser.tape.append(0, internal::tape_type::NULL_VALUE);
     goto finish;
   case '0': case '1': case '2': case '3': case '4':
   case '5': case '6': case '7': case '8': case '9':
     FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], false);
       })
     );
     goto finish;
   case '-':
     FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], true);
       })
     );
     goto finish;
   default:
+    parser.log_error("Document starts with a non-value character");
     goto error;
   }
 
@@ -6785,43 +7268,45 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa
   switch (parser.advance_char()) {
   case '"': {
     parser.increment_count();
-    FAIL_IF( parser.parse_string() );
+    FAIL_IF( parser.parse_string(true) );
     goto object_key_state;
   }
   case '}':
     parser.end_object();
     goto scope_end;
   default:
+    parser.log_error("Object does not start with a key");
     goto error;
   }
 
 object_key_state:
-  FAIL_IF( parser.advance_char() != ':' );
-  parser.advance_char();
+  if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; }
   GOTO( parser.parse_value(addresses, addresses.object_continue) );
 
 object_continue:
   switch (parser.advance_char()) {
   case ',':
     parser.increment_count();
-    FAIL_IF( parser.advance_char() != '"' );
-    FAIL_IF( parser.parse_string() );
+    if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; }
+    FAIL_IF( parser.parse_string(true) );
     goto object_key_state;
   case '}':
     parser.end_object();
     goto scope_end;
   default:
+    parser.log_error("No comma between object fields");
     goto error;
   }
 
 scope_end:
-  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
+  CONTINUE( parser.parser.ret_address[parser.depth] );
 
 //
 // Array parser states
 //
 array_begin:
-  if (parser.advance_char() == ']') {
+  if (parser.peek_next_char() == ']') {
+    parser.advance_char();
     parser.end_array();
     goto scope_end;
   }
@@ -6836,12 +7321,12 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa
   switch (parser.advance_char()) {
   case ',':
     parser.increment_count();
-    parser.advance_char();
     goto main_array_switch;
   case ']':
     parser.end_array();
     goto scope_end;
   default:
+    parser.log_error("Missing comma between array values");
     goto error;
   }
 
@@ -6852,178 +7337,191 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa
   return parser.error();
 }
 
-WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
-  error_code code = stage1(buf, len, doc_parser, false);
-  if (!code) {
-    code = stage2(buf, len, doc_parser);
-  }
-  return code;
-}
-/* end file src/generic/stage2/structural_parser.h */
-/* begin file src/generic/stage2/streaming_structural_parser.h */
-namespace stage2 {
-
-struct streaming_structural_parser: structural_parser {
-  really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {}
+} // namespace {}
+} // namespace stage2
 
-  // override to add streaming
-  WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) {
-    init(); // sets is_valid to false
-    // Capacity ain't no thang for streaming, so we don't check it.
-    // Advance to the first character as soon as possible
-    advance_char();
-    // Push the root scope (there is always at least one scope)
-    if (start_document(finish_parser)) {
-      return on_error(DEPTH_ERROR);
-    }
-    return SUCCESS;
-  }
+/************
+ * The JSON is parsed to a tape, see the accompanying tape.md file
+ * for documentation.
+ ***********/
+WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
+  error_code result = stage2::parse_structurals<false>(*this, _doc);
+  if (result) { return result; }
 
-  // override to add streaming
-  WARN_UNUSED really_inline error_code finish() {
-    if ( structurals.past_end(doc_parser.n_structural_indexes) ) {
-      return on_error(TAPE_ERROR);
-    }
-    end_document();
-    if (depth != 0) {
-      return on_error(TAPE_ERROR);
-    }
-    if (doc_parser.containing_scope[depth].tape_index != 0) {
-      return on_error(TAPE_ERROR);
-    }
-    bool finished = structurals.at_end(doc_parser.n_structural_indexes);
-    return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
+  // If we didn't make it to the end, it's an error
+  if ( next_structural_index != n_structural_indexes ) {
+    logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
+    return error = TAPE_ERROR;
   }
-};
 
-} // namespace stage2
+  return SUCCESS;
+}
 
 /************
  * The JSON is parsed to a tape, see the accompanying tape.md file
  * for documentation.
  ***********/
-WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept {
-  static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
-  stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json));
-  error_code result = parser.start(len, addresses.finish);
-  if (result) { return result; }
-  //
-  // Read first value
-  //
-  switch (parser.structurals.current_char()) {
-  case '{':
-    FAIL_IF( parser.start_object(addresses.finish) );
-    goto object_begin;
-  case '[':
-    FAIL_IF( parser.start_array(addresses.finish) );
-    goto array_begin;
-  case '"':
-    FAIL_IF( parser.parse_string() );
-    goto finish;
-  case 't': case 'f': case 'n':
-    FAIL_IF( parser.parse_single_atom() );
-    goto finish;
-  case '0': case '1': case '2': case '3': case '4':
-  case '5': case '6': case '7': case '8': case '9':
-    FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
-        return parser.parse_number(&copy[idx], false);
-      })
-    );
-    goto finish;
-  case '-':
-    FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
-        return parser.parse_number(&copy[idx], true);
-      })
-    );
-    goto finish;
-  default:
-    goto error;
-  }
+WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
+  return stage2::parse_structurals<true>(*this, _doc);
+}
+/* end file src/generic/stage2/tape_writer.h */
 
-//
-// Object parser parsers
-//
-object_begin:
-  switch (parser.advance_char()) {
-  case '"': {
-    FAIL_IF( parser.parse_string() );
-    goto object_key_parser;
-  }
-  case '}':
-    parser.end_object();
-    goto scope_end;
-  default:
-    goto error;
-  }
+// Runs stage 1 (partial = false); on success runs stage 2 into _doc, otherwise returns the stage 1 error.
+WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
+  const error_code stage1_err = stage1(_buf, _len, false);
+  return stage1_err ? stage1_err : stage2(_doc);
+}
 
-object_key_parser:
-  FAIL_IF( parser.advance_char() != ':' );
-  parser.increment_count();
-  parser.advance_char();
-  GOTO( parser.parse_value(addresses, addresses.object_continue) );
+} // namespace fallback
+} // namespace simdjson
+/* end file src/generic/stage2/tape_writer.h */
+#endif
+#if SIMDJSON_IMPLEMENTATION_HASWELL
+/* begin file src/haswell/implementation.cpp */
+/* haswell/implementation.h already included: #include "haswell/implementation.h" */
+/* begin file src/haswell/dom_parser_implementation.h */
+#ifndef SIMDJSON_HASWELL_DOM_PARSER_IMPLEMENTATION_H
+#define SIMDJSON_HASWELL_DOM_PARSER_IMPLEMENTATION_H
 
-object_continue:
-  switch (parser.advance_char()) {
-  case ',':
-    FAIL_IF( parser.advance_char() != '"' );
-    FAIL_IF( parser.parse_string() );
-    goto object_key_parser;
-  case '}':
-    parser.end_object();
-    goto scope_end;
-  default:
-    goto error;
-  }
+/* isadetection.h already included: #include "isadetection.h" */
 
-scope_end:
-  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
+namespace simdjson {
+namespace haswell {
+
+/* begin file src/generic/dom_parser_implementation.h */
+// expectation: sizeof(scope_descriptor) = 64/8.
+struct scope_descriptor {
+  uint32_t tape_index; // where, on the tape, does the scope ([,{) begins
+  uint32_t count; // how many elements in the scope
+}; // struct scope_descriptor
+
+#ifdef SIMDJSON_USE_COMPUTED_GOTO
+typedef void* ret_address_t;
+#else
+typedef char ret_address_t;
+#endif
+
+class dom_parser_implementation final : public internal::dom_parser_implementation {
+public:
+  /** Tape location of each open { or [ */
+  std::unique_ptr<scope_descriptor[]> containing_scope{};
+  /** Return address of each open { or [ */
+  std::unique_ptr<ret_address_t[]> ret_address{};
+  /** Buffer passed to stage 1 */
+  const uint8_t *buf{};
+  /** Length passed to stage 1 */
+  size_t len{0};
+  /** Document passed to stage 2 */
+  dom::document *doc{};
+  /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */
+  error_code error{UNINITIALIZED};
+
+  really_inline dom_parser_implementation();
+  dom_parser_implementation(const dom_parser_implementation &) = delete;
+  dom_parser_implementation & operator=(const dom_parser_implementation &) = delete;
+
+  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
+  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final;
+  WARN_UNUSED error_code check_for_unclosed_array() noexcept;
+  WARN_UNUSED error_code stage2(dom::document &doc) noexcept final;
+  WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final;
+  WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final;
+  WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final;
+};
+
+/* begin file src/generic/stage1/allocate.h */
+namespace stage1 {
+namespace allocate {
 
 //
-// Array parser parsers
+// Allocates stage 1 internal state and outputs in the parser
 //
-array_begin:
-  if (parser.advance_char() == ']') {
-    parser.end_array();
-    goto scope_end;
-  }
-  parser.increment_count();
+really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) {
+  size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; // NOTE(review): the "+ 2 + 7" slack is not explained here -- confirm against the stage 1 indexer
+  parser.structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); // nothrow new: failure yields a null pointer, checked on the next line
+  if (!parser.structural_indexes) { return MEMALLOC; }
+  parser.structural_indexes[0] = 0;
+  parser.n_structural_indexes = 0; // start with no structural indexes recorded
+  return SUCCESS;
+}
 
-main_array_switch:
-  /* we call update char on all paths in, so we can peek at parser.c on the
-   * on paths that can accept a close square brace (post-, and at start) */
-  GOTO( parser.parse_value(addresses, addresses.array_continue) );
+} // namespace allocate
+} // namespace stage1
+/* end file src/generic/stage1/allocate.h */
+/* begin file src/generic/stage2/allocate.h */
+namespace stage2 {
+namespace allocate {
 
-array_continue:
-  switch (parser.advance_char()) {
-  case ',':
-    parser.increment_count();
-    parser.advance_char();
-    goto main_array_switch;
-  case ']':
-    parser.end_array();
-    goto scope_end;
-  default:
-    goto error;
+//
+// Allocates stage 2 internal state and outputs in the parser
+//
+really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) {
+  parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]);
+  parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]);
+
+  if (!parser.ret_address || !parser.containing_scope) {
+    return MEMALLOC;
   }
+  return SUCCESS;
+}
 
-finish:
-  next_json = parser.structurals.next_structural_index();
-  return parser.finish();
+} // namespace allocate
+} // namespace stage2
+/* end file src/generic/stage2/allocate.h */
 
-error:
-  return parser.error();
+really_inline dom_parser_implementation::dom_parser_implementation() {}
+
+// Leaving these here so they can be inlined if so desired
+WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
+  error_code err = stage1::allocate::set_capacity(*this, capacity);
+  if (err) { _capacity = 0; return err; }
+  _capacity = capacity;
+  return SUCCESS;
 }
-/* end file src/generic/stage2/streaming_structural_parser.h */
 
-} // namespace fallback
+WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept {
+  error_code err = stage2::allocate::set_max_depth(*this, max_depth); // replaces the containing_scope and ret_address arrays
+  if (err) { _max_depth = 0; return err; } // on failure, record a max depth of 0 and report the error
+  _max_depth = max_depth;
+  return SUCCESS;
+}
+/* end file src/generic/stage2/allocate.h */
+
+} // namespace haswell
 } // namespace simdjson
-/* end file src/generic/stage2/streaming_structural_parser.h */
-#endif
-#if SIMDJSON_IMPLEMENTATION_HASWELL
-/* begin file src/haswell/stage1.cpp */
 
+#endif // SIMDJSON_HASWELL_DOM_PARSER_IMPLEMENTATION_H
+/* end file src/generic/stage2/allocate.h */
+
+TARGET_HASWELL
+
+namespace simdjson {
+namespace haswell {
+
+// Allocates a haswell parser into dst and sizes its buffers. Returns MEMALLOC if the parser
+// cannot be allocated, else the first error from set_capacity()/set_max_depth(), else SUCCESS.
+WARN_UNUSED error_code implementation::create_dom_parser_implementation(
+  size_t capacity, size_t max_depth, std::unique_ptr<internal::dom_parser_implementation>& dst
+) const noexcept {
+  dst.reset( new (std::nothrow) dom_parser_implementation() );
+  if (!dst) { return MEMALLOC; }
+  if (error_code err = dst->set_capacity(capacity)) { return err; } // propagate stage 1 buffer allocation failure
+  if (error_code err = dst->set_max_depth(max_depth)) { return err; } // propagate stage 2 buffer allocation failure
+  return SUCCESS;
+}
+
+} // namespace haswell
+} // namespace simdjson
+
+UNTARGET_REGION
+/* end file src/generic/stage2/allocate.h */
+/* begin file src/haswell/dom_parser_implementation.cpp */
+/* haswell/implementation.h already included: #include "haswell/implementation.h" */
+/* haswell/dom_parser_implementation.h already included: #include "haswell/dom_parser_implementation.h" */
+
+//
+// Stage 1
+//
 /* begin file src/haswell/bitmask.h */
 #ifndef SIMDJSON_HASWELL_BITMASK_H
 #define SIMDJSON_HASWELL_BITMASK_H
@@ -7568,7 +8066,6 @@ UNTARGET_REGION
 #endif // SIMDJSON_HASWELL_SIMD_H
 /* end file src/haswell/bitmanipulation.h */
 /* haswell/bitmanipulation.h already included: #include "haswell/bitmanipulation.h" */
-/* haswell/implementation.h already included: #include "haswell/implementation.h" */
 
 TARGET_HASWELL
 namespace simdjson {
@@ -7627,24 +8124,21 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-  really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
-  really_inline size_t block_index() { return idx; }
-  really_inline bool has_full_block() const {
-    return idx < lenminusstep;
-  }
-  really_inline const uint8_t *full_block() const {
-    return &buf[idx];
-  }
-  really_inline bool has_remainder() const {
-    return idx < len;
-  }
-  really_inline void get_remainder(uint8_t *tmp_buf) const {
-    memset(tmp_buf, 0x20, STEP_SIZE);
-    memcpy(tmp_buf, buf + idx, len - idx);
-  }
-  really_inline void advance() {
-    idx += STEP_SIZE;
-  }
+  really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
+  really_inline size_t block_index();
+  really_inline bool has_full_block() const;
+  really_inline const uint8_t *full_block() const;
+  /**
+   * Get the last block, padded with spaces.
+   *
+   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
+   * function fills the buffer with spaces and returns 0). In particular, if len == STEP_SIZE there
+   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+   *
+   * @return the number of effective characters in the last block.
+   */
+  really_inline size_t get_remainder(uint8_t *dst) const;
+  really_inline void advance();
 private:
   const uint8_t *buf;
   const size_t len;
@@ -7652,6 +8146,18 @@ struct buf_block_reader {
   size_t idx;
 };
 
+constexpr const int TITLE_SIZE = 12;
+
+// Routines to print masks and text for debugging bitmask operations
+UNUSED static char * format_input_text_64(const uint8_t *text) {
+  static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
+  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
+    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf;
+}
+
 // Routines to print masks and text for debugging bitmask operations
 UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
   static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
@@ -7671,6 +8177,34 @@ UNUSED static char * format_mask(uint64_t mask) {
   buf[64] = '\0';
   return buf;
 }
+
+// Reader over the _len bytes at _buf, consumed in STEP_SIZE-byte blocks starting at offset 0.
+template<size_t STEP_SIZE>
+really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
+  : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
+
+template<size_t STEP_SIZE>
+really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
+
+// True while more than STEP_SIZE bytes remain past the current offset (idx < lenminusstep).
+template<size_t STEP_SIZE>
+really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const { return idx < lenminusstep; }
+
+// Pointer to the byte at the current offset inside the input buffer (nothing is copied).
+template<size_t STEP_SIZE>
+really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const { return buf + idx; }
+
+// Copies the bytes from the current offset to the end into dst, space-padded; returns that byte count.
+template<size_t STEP_SIZE>
+really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
+  memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+  memcpy(dst, &buf[idx], len - idx);
+  return len - idx;
+}
+
+// Moves the current offset forward by one block.
+template<size_t STEP_SIZE>
+really_inline void buf_block_reader<STEP_SIZE>::advance() { idx += STEP_SIZE; }
 /* end file src/generic/stage1/buf_block_reader.h */
 /* begin file src/generic/stage1/json_string_scanner.h */
 namespace stage1 {
@@ -7970,13 +8504,15 @@ template<size_t STEP_SIZE>
 error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
   buf_block_reader<STEP_SIZE> reader(buf, len);
   json_minifier minifier(dst);
+
+  // Index the first n-1 blocks
   while (reader.has_full_block()) {
     minifier.step<STEP_SIZE>(reader.full_block(), reader);
   }
 
-  if (likely(reader.has_remainder())) {
-    uint8_t block[STEP_SIZE];
-    reader.get_remainder(block);
+  // Index the last (remainder) block, padded with spaces
+  uint8_t block[STEP_SIZE];
+  if (likely(reader.get_remainder(block)) > 0) {
     minifier.step<STEP_SIZE>(block, reader);
   }
 
@@ -7989,6 +8525,94 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
   return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
 }
 
+/* begin file src/generic/stage1/find_next_document_index.h */
+/**
+  * This algorithm is used to quickly identify the last structural position that
+  * makes up a complete document.
+  *
+  * It does this by going backwards and finding the last *document boundary* (a
+  * place where one value follows another without a comma between them). If the
+  * last document (the characters after the boundary) has an equal number of
+  * start and end brackets, it is considered complete.
+  *
+  * Simply put, we iterate over the structural characters, starting from
+  * the end. We consider that we found the end of a JSON document when the
+  * first element of the pair is NOT one of these characters: '{' '[' ':' ','
+  * and when the second element is NOT one of these characters: '}' ']' ':' ','.
+  *
+  * This simple comparison works most of the time, but it does not cover cases
+  * where the batch's structural indexes contain a perfect amount of documents.
+  * In such a case, we do not have access to the structural index which follows
+  * the last document, therefore, we do not have access to the second element in
+  * the pair, and that means we cannot identify the last document. To fix this
+  * issue, we keep a count of the open and closed curly/square braces we found
+  * while searching for the pair. When we find a pair AND the count of open and
+  * closed curly/square braces is the same, we know that we just passed a
+  * complete document, therefore the last json buffer location is the end of the
+  * batch.
+  */
+really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) {
+  // TODO don't count separately, just figure out depth
+  auto arr_cnt = 0;
+  auto obj_cnt = 0;
+  for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
+    auto idxb = parser.structural_indexes[i];
+    switch (parser.buf[idxb]) {
+    case ':':
+    case ',':
+      continue;
+    case '}':
+      obj_cnt--;
+      continue;
+    case ']':
+      arr_cnt--;
+      continue;
+    case '{':
+      obj_cnt++;
+      break;
+    case '[':
+      arr_cnt++;
+      break;
+    }
+    auto idxa = parser.structural_indexes[i - 1];
+    switch (parser.buf[idxa]) {
+    case '{':
+    case '[':
+    case ':':
+    case ',':
+      continue;
+    }
+    // Last document is complete, so the next document will appear after!
+    if (!arr_cnt && !obj_cnt) {
+      return parser.n_structural_indexes;
+    }
+    // Last document is incomplete; it starts at structural i, so only the first i structurals are kept
+    return i;
+  }
+  return 0;
+}
+
+// Skip the last character if it is partial
+really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
+  if (unlikely(len < 3)) {
+    switch (len) {
+      case 2:
+        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
+        if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
+        return len;
+      case 1:
+        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
+        return len;
+      case 0:
+        return len;
+    }
+  }
+  if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
+  if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
+  if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
+  return len;
+}
+/* end file src/generic/stage1/find_next_document_index.h */
 /* begin file src/generic/stage1/utf8_lookup2_algorithm.h */
 //
 // Detect Unicode errors.
@@ -8039,9 +8663,9 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
 //   support values with more than 23 bits (which a 4-byte character supports).
 //
 //   e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
-//   
+//
 // Legal utf-8 byte sequences per  http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
-// 
+//
 //   Code Points        1st       2s       3s       4s
 //  U+0000..U+007F     00..7F
 //  U+0080..U+07FF     C2..DF   80..BF
@@ -8056,6 +8680,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
 using namespace simd;
 
 namespace utf8_validation {
+  // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)".
 
   //
   // Find special case UTF-8 errors where the character is technically readable (has the right length)
@@ -8100,7 +8725,7 @@ namespace utf8_validation {
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
       // [0___]____ (ASCII)
-      0, 0, 0, 0,                          
+      0, 0, 0, 0,
       0, 0, 0, 0,
       // [10__]____ (continuation)
       0, 0, 0, 0,
@@ -8131,214 +8756,6 @@ namespace utf8_validation {
     return byte_1_high & byte_1_low & byte_2_high;
   }
 
-  //
-  // Validate the length of multibyte characters (that each multibyte character has the right number
-  // of continuation characters, and that all continuation characters are part of a multibyte
-  // character).
-  //
-  // Algorithm
-  // =========
-  //
-  // This algorithm compares *expected* continuation characters with *actual* continuation bytes,
-  // and emits an error anytime there is a mismatch.
-  //
-  // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte
-  // characters, the file will look like this:
-  //
-  // | Character             | 𝄞  |    |    |    | ₿  |    |    | ֏  |    | a  | b  |
-  // |-----------------------|----|----|----|----|----|----|----|----|----|----|----|
-  // | Character Length      |  4 |    |    |    |  3 |    |    |  2 |    |  1 |  1 |
-  // | Byte                  | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 |
-  // | is_second_byte        |    |  X |    |    |    |  X |    |    |  X |    |    |
-  // | is_third_byte         |    |    |  X |    |    |    |  X |    |    |    |    |
-  // | is_fourth_byte        |    |    |    |  X |    |    |    |    |    |    |    |
-  // | expected_continuation |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
-  // | is_continuation       |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
-  //
-  // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation):
-  //
-  // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not
-  //   part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just
-  //   floating around extra outside of any character, or that there is an illegal 5-byte character,
-  //   or maybe it's at the beginning of the file before any characters have started; but it's an
-  //   error in all these cases.
-  // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means
-  //   we started a new character before we were finished with the current one.
-  //
-  // Getting the Previous Bytes
-  // --------------------------
-  //
-  // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte
-  // character, we need to "shift the bytes" to find that out. This is what they mean:
-  //
-  // - `is_continuation`: if the current byte is a continuation.
-  // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character.
-  // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character.
-  // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character.
-  //
-  // We use shuffles to go n bytes back, selecting part of the current `input` and part of the
-  // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller
-  // function, because the 1-byte-back data is used by other checks as well.
-  //
-  // Getting the Continuation Mask
-  // -----------------------------
-  //
-  // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as
-  // numbers, using signed `<` and `>` operations to check if they are continuations or leads.
-  // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because
-  // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones).
-  //
-  // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads,"
-  // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them.
-  // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0.
-  //
-  // When treated as signed numbers, they look like this:
-  //
-  // | Type         | High Bits  | Binary Range | Signed |
-  // |--------------|------------|--------------|--------|
-  // | ASCII        | `0`        | `01111111`   |   127  |
-  // |              |            | `00000000`   |     0  |
-  // | 4+-Byte Lead | `1111`     | `11111111`   |    -1  |
-  // |              |            | `11110000    |   -16  |
-  // | 3-Byte Lead  | `1110`     | `11101111`   |   -17  |
-  // |              |            | `11100000    |   -32  |
-  // | 2-Byte Lead  | `110`      | `11011111`   |   -33  |
-  // |              |            | `11000000    |   -64  |
-  // | Continuation | `10`       | `10111111`   |   -65  |
-  // |              |            | `10000000    |  -128  |
-  //
-  // This makes it pretty easy to get the continuation mask! It's just a single comparison:
-  //
-  // ```
-  // is_continuation = input < -64`
-  // ```
-  //
-  // We can do something similar for the others, but it takes two comparisons instead of one: "is
-  // the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and
-  // `> -64`. Surely we can do better, they're right next to each other!
-  //
-  // Getting the is_xxx Masks: Shifting the Range
-  // --------------------------------------------
-  //
-  // Notice *why* continuations were a single comparison. The actual *range* would require two
-  // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get
-  // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be
-  // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`.
-  //
-  // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps
-  // ASCII down into the negative, and puts 4+-Byte Lead at the top:
-  //
-  // | Type                 | High Bits  | Binary Range | Signed |
-  // |----------------------|------------|--------------|-------|
-  // | 4+-Byte Lead (+ 127) | `0111`     | `01111111`   |   127 |
-  // |                      |            | `01110000    |   112 |
-  // |----------------------|------------|--------------|-------|
-  // | 3-Byte Lead (+ 127)  | `0110`     | `01101111`   |   111 |
-  // |                      |            | `01100000    |    96 |
-  // |----------------------|------------|--------------|-------|
-  // | 2-Byte Lead (+ 127)  | `010`      | `01011111`   |    95 |
-  // |                      |            | `01000000    |    64 |
-  // |----------------------|------------|--------------|-------|
-  // | Continuation (+ 127) | `00`       | `00111111`   |    63 |
-  // |                      |            | `00000000    |     0 |
-  // |----------------------|------------|--------------|-------|
-  // | ASCII (+ 127)        | `1`        | `11111111`   |    -1 |
-  // |                      |            | `10000000`   |  -128 |
-  // |----------------------|------------|--------------|-------|
-  // 
-  // *Now* we can use signed `>` on all of them:
-  //
-  // ```
-  // prev1 = input.prev<1>
-  // prev2 = input.prev<2>
-  // prev3 = input.prev<3>
-  // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128`
-  // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128`
-  // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128`
-  // is_second_byte = prev1_flipped > 63;  // 2+-byte lead
-  // is_third_byte  = prev2_flipped > 95;  // 3+-byte lead
-  // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead
-  // ```
-  //
-  // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number
-  // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3
-  // `^`'s at a time on Haswell, but only 2 `+`'s).
-  //
-  // That doesn't look like it saved us any instructions, did it? Well, because we're adding the
-  // same number to all of them, we can save one of those `+ 128` operations by assembling
-  // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128
-  // to it. One more instruction saved!
-  //
-  // ```
-  // prev1 = input.prev<1>
-  // prev3 = input.prev<3>
-  // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128`
-  // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128`
-  // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped): // <shuffle: take the first 2 bytes from prev1 and the rest from prev3  
-  // ```
-  //
-  // ### Bringing It All Together: Detecting the Errors
-  //
-  // At this point, we have `is_continuation`, `is_first_byte`, `is_second_byte` and `is_third_byte`.
-  // All we have left to do is check if they match!
-  //
-  // ```
-  // return (is_second_byte | is_third_byte | is_fourth_byte) ^ is_continuation;
-  // ```
-  //
-  // But wait--there's more. The above statement is only 3 operations, but they *cannot be done in
-  // parallel*. You have to do 2 `|`'s and then 1 `&`. Haswell, at least, has 3 ports that can do
-  // bitwise operations, and we're only using 1!
-  //
-  // Epilogue: Addition For Booleans
-  // -------------------------------
-  //
-  // There is one big case the above code doesn't explicitly talk about--what if is_second_byte
-  // and is_third_byte are BOTH true? That means there is a 3-byte and 2-byte character right next
-  // to each other (or any combination), and the continuation could be part of either of them!
-  // Our algorithm using `&` and `|` won't detect that the continuation byte is problematic.
-  //
-  // Never fear, though. If that situation occurs, we'll already have detected that the second
-  // leading byte was an error, because it was supposed to be a part of the preceding multibyte
-  // character, but it *wasn't a continuation*.
-  //
-  // We could stop here, but it turns out that we can fix it using `+` and `-` instead of `|` and
-  // `&`, which is both interesting and possibly useful (even though we're not using it here). It
-  // exploits the fact that in SIMD, a *true* value is -1, and a *false* value is 0. So those
-  // comparisons were giving us numbers!
-  //
-  // Given that, if you do `is_second_byte + is_third_byte + is_fourth_byte`, under normal
-  // circumstances you will either get 0 (0 + 0 + 0) or -1 (-1 + 0 + 0, etc.). Thus,
-  // `(is_second_byte + is_third_byte + is_fourth_byte) - is_continuation` will yield 0 only if
-  // *both* or *neither* are 0 (0-0 or -1 - -1). You'll get 1 or -1 if they are different. Because
-  // *any* nonzero value is treated as an error (not just -1), we're just fine here :)
-  //
-  // Further, if *more than one* multibyte character overlaps,
-  // `is_second_byte + is_third_byte + is_fourth_byte` will be -2 or -3! Subtracting `is_continuation`
-  // from *that* is guaranteed to give you a nonzero value (-1, -2 or -3). So it'll always be
-  // considered an error.
-  //
-  // One reason you might want to do this is parallelism. ^ and | are not associative, so
-  // (A | B | C) ^ D will always be three operations in a row: either you do A | B -> | C -> ^ D, or
-  // you do B | C -> | A -> ^ D. But addition and subtraction *are* associative: (A + B + C) - D can
-  // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and
-  // then adds the result together. Same number of operations, but if the processor can run
-  // independent things in parallel (which most can), it runs faster.
-  //
-  // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have
-  // a super nice advantage in that more of them can be run at the same time (they can run on 3
-  // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C,
-  // saving us the cycle we would have earned by using +. Even more, using an instruction with a
-  // wider array of ports can help *other* code run ahead, too, since these instructions can "get
-  // out of the way," running on a port other instructions can't.
-  // 
-  // Epilogue II: One More Trick
-  // ---------------------------
-  //
-  // There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay
-  // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in
-  // check_special_cases()--but we'll talk about that there :)
-  //
   really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, simd8<uint8_t> prev1) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
@@ -8476,16 +8893,22 @@ class bit_indexer {
 
 class json_structural_indexer {
 public:
+  /**
+   * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
+   *
+   * @param partial Setting the partial parameter to true allows the find_structural_bits to
+   *   tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
+   *   you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
+   */
   template<size_t STEP_SIZE>
-  static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept;
+  static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept;
 
 private:
-  really_inline json_structural_indexer(uint32_t *structural_indexes)
-  : indexer{structural_indexes} {}
+  really_inline json_structural_indexer(uint32_t *structural_indexes);
   template<size_t STEP_SIZE>
   really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
   really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
-  really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming);
+  really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
 
   json_scanner scanner{};
   utf8_checker checker{};
@@ -8494,65 +8917,8 @@ class json_structural_indexer {
   uint64_t unescaped_chars_error = 0;
 };
 
-really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
-  uint64_t unescaped = in.lteq(0x1F);
-  checker.check_next_input(in);
-  indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
-  prev_structurals = block.structural_start();
-  unescaped_chars_error |= block.non_quote_inside_string(unescaped);
-}
-
-really_inline error_code json_structural_indexer::finish(parser &parser, size_t idx, size_t len, bool streaming) {
-  // Write out the final iteration's structurals
-  indexer.write(uint32_t(idx-64), prev_structurals);
-
-  error_code error = scanner.finish(streaming);
-  if (unlikely(error != SUCCESS)) { return error; }
-
-  if (unescaped_chars_error) {
-    return UNESCAPED_CHARS;
-  }
-
-  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
-  /* a valid JSON file cannot have zero structural indexes - we should have
-   * found something */
-  if (unlikely(parser.n_structural_indexes == 0u)) {
-    return EMPTY;
-  }
-  if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
-    return UNEXPECTED_ERROR;
-  }
-  if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) {
-    /* the string might not be NULL terminated, but we add a virtual NULL
-     * ending character. */
-    parser.structural_indexes[parser.n_structural_indexes++] = uint32_t(len);
-  }
-  /* make it safe to dereference one beyond this array */
-  parser.structural_indexes[parser.n_structural_indexes] = 0;
-  return checker.errors();
-}
-
-template<>
-really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
-  simd::simd8x64<uint8_t> in_1(block);
-  simd::simd8x64<uint8_t> in_2(block+64);
-  json_block block_1 = scanner.next(in_1);
-  json_block block_2 = scanner.next(in_2);
-  this->next(in_1, block_1, reader.block_index());
-  this->next(in_2, block_2, reader.block_index()+64);
-  reader.advance();
-}
-
-template<>
-really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
-  simd::simd8x64<uint8_t> in_1(block);
-  json_block block_1 = scanner.next(in_1);
-  this->next(in_1, block_1, reader.block_index());
-  reader.advance();
-}
+really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
 
-//
-// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
 //
 // PERF NOTES:
 // We pipe 2 inputs through these stages:
@@ -8570,41 +8936,116 @@ really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_b
 // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
 // workout.
 //
-// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings.
-// The caller should still ensure that the input is valid UTF-8. If you are processing substrings,
-// you may want to call on a function like trimmed_length_safe_utf8.
 template<size_t STEP_SIZE>
-error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept {
+error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept {
   if (unlikely(len > parser.capacity())) { return CAPACITY; }
+  if (partial) { len = trim_partial_utf8(buf, len); }
 
   buf_block_reader<STEP_SIZE> reader(buf, len);
   json_structural_indexer indexer(parser.structural_indexes.get());
+
+  // Read all but the last block
   while (reader.has_full_block()) {
     indexer.step<STEP_SIZE>(reader.full_block(), reader);
   }
 
-  if (likely(reader.has_remainder())) {
-    uint8_t block[STEP_SIZE];
-    reader.get_remainder(block);
-    indexer.step<STEP_SIZE>(block, reader);
-  }
-
-  return indexer.finish(parser, reader.block_index(), len, streaming);
-}
+  // Take care of the last block (will always be there unless file is empty)
+  uint8_t block[STEP_SIZE];
+  if (unlikely(reader.get_remainder(block) == 0)) { return EMPTY; }
+  indexer.step<STEP_SIZE>(block, reader);
 
-} // namespace stage1
-/* end file src/generic/stage1/json_structural_indexer.h */
-WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept {
-  return haswell::stage1::json_structural_indexer::index<128>(buf, len, parser, streaming);
+  return indexer.finish(parser, reader.block_index(), len, partial);
 }
 
-} // namespace haswell
+template<>
+really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
+  simd::simd8x64<uint8_t> in_1(block);
+  simd::simd8x64<uint8_t> in_2(block+64);
+  json_block block_1 = scanner.next(in_1);
+  json_block block_2 = scanner.next(in_2);
+  this->next(in_1, block_1, reader.block_index());
+  this->next(in_2, block_2, reader.block_index()+64);
+  reader.advance();
+}
+
+template<>
+really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
+  simd::simd8x64<uint8_t> in_1(block);
+  json_block block_1 = scanner.next(in_1);
+  this->next(in_1, block_1, reader.block_index());
+  reader.advance();
+}
 
+really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
+  uint64_t unescaped = in.lteq(0x1F);
+  checker.check_next_input(in);
+  indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
+  prev_structurals = block.structural_start();
+  unescaped_chars_error |= block.non_quote_inside_string(unescaped);
+}
+
+really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) {
+  // Write out the final iteration's structurals
+  indexer.write(uint32_t(idx-64), prev_structurals);
+
+  error_code error = scanner.finish(partial);
+  if (unlikely(error != SUCCESS)) { return error; }
+
+  if (unescaped_chars_error) {
+    return UNESCAPED_CHARS;
+  }
+
+  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
+  /***
+   * This is related to https://github.com/simdjson/simdjson/issues/906
+   * Basically, we want to make sure that if the parsing continues beyond the last (valid)
+   * structural character, it quickly stops.
+   * Only three structural characters can be repeated without triggering an error in JSON: '[', ']' and '}'.
+   * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
+   * continues, then it must be '[', ']' or '}'.
+   * Suppose it is ] or }. We backtrack to the first character, what could it be that would
+   * not trigger an error? It could be ] or } but no, because you can't start a document that way.
+   * It can't be a comma, a colon or any simple value. So the only way we could continue is
+   * if the repeated character is [. But if so, the document must start with [. But if the document
+   * starts with [, it should end with ]. If we enforce that rule, then we would get
+   * ][[ which is invalid.
+   **/
+  parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
+  parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
+  parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
+  parser.next_structural_index = 0;
+  // a valid JSON file cannot have zero structural indexes - we should have found something
+  if (unlikely(parser.n_structural_indexes == 0u)) {
+    return EMPTY;
+  }
+  if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
+    return UNEXPECTED_ERROR;
+  }
+  if (partial) {
+    auto new_structural_indexes = find_next_document_index(parser);
+    if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
+      return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
+    }
+    parser.n_structural_indexes = new_structural_indexes;
+  }
+  return checker.errors();
+}
+
+} // namespace stage1
+/* end file src/generic/stage1/json_structural_indexer.h */
+WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
+  this->buf = _buf;
+  this->len = _len;
+  return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
+}
+
+} // namespace haswell
 } // namespace simdjson
 UNTARGET_REGION
-/* end file src/generic/stage1/json_structural_indexer.h */
-/* begin file src/haswell/stage2.cpp */
-/* haswell/implementation.h already included: #include "haswell/implementation.h" */
+
+//
+// Stage 2
+//
 /* begin file src/haswell/stringparsing.h */
 #ifndef SIMDJSON_HASWELL_STRINGPARSING_H
 #define SIMDJSON_HASWELL_STRINGPARSING_H
@@ -9015,10 +9456,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
   // If you consume a large value and you map it to "infinity", you will no
   // longer be able to serialize back a standard-compliant JSON. And there is
   // no realistic application where you might need values so large than they
-  // can't fit in binary64. The maximal value is about  1.7976931348623157 ×
+  // can't fit in binary64. The maximal value is about  1.7976931348623157 x
   // 10^308 It is an unimaginable large number. There will never be any piece of
   // engineering involving as many as 10^308 parts. It is estimated that there
-  // are about 10^80 atoms in the universe.  The estimate for the total number
+  // are about 10^80 atoms in the universe.  The estimate for the total number
   // of electrons is similar. Using a double-precision floating-point value, we
   // can represent easily the number of atoms in the universe. We could  also
   // represent the number of ways you can pick any three individual atoms at
@@ -9038,26 +9479,6 @@ really_inline bool is_integer(char c) {
   // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers
 }
 
-// We need to check that the character following a zero is valid. This is
-// probably frequent and it is harder than it looks. We are building all of this
-// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)...
-const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
-    1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-
-really_inline bool
-is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
-  return structural_or_whitespace_or_exponent_or_decimal_negated[c];
-}
 
 // check quickly whether the next 8 chars are made of digits
 // at a glance, it looks better than Mula's
@@ -9135,14 +9556,14 @@ never_inline bool parse_large_integer(const uint8_t *const src,
       // as a positive signed integer, but the negative version is
       // possible.
       constexpr int64_t signed_answer = INT64_MIN;
-      writer.write_s64(signed_answer);
+      writer.append_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
     } else {
       // we can negate safely
       int64_t signed_answer = -static_cast<int64_t>(i);
-      writer.write_s64(signed_answer);
+      writer.append_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
@@ -9155,12 +9576,12 @@ never_inline bool parse_large_integer(const uint8_t *const src,
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(i, src);
 #endif
-      writer.write_s64(i);
+      writer.append_s64(i);
     } else {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_unsigned_integer(i, src);
 #endif
-      writer.write_u64(i);
+      writer.append_u64(i);
     }
   }
   return is_structural_or_whitespace(*p);
@@ -9170,7 +9591,7 @@ template<typename W>
 bool slow_float_parsing(UNUSED const char * src, W writer) {
   double d;
   if (parse_float_strtod(src, &d)) {
-    writer.write_double(d);
+    writer.append_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_float(d, (const uint8_t *)src);
 #endif
@@ -9194,10 +9615,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) {
 template<typename W>
 really_inline bool parse_number(UNUSED const uint8_t *const src,
                                 UNUSED bool found_minus,
-                                W writer) {
+                                W &writer) {
 #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
                                   // useful to skip parsing
-  writer.write_s64(0);        // always write zero
+  writer.append_s64(0);        // always write zero
   return true;                    // always succeeds
 #else
   const char *p = reinterpret_cast<const char *>(src);
@@ -9217,7 +9638,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
   uint64_t i;      // an unsigned int avoids signed overflows (which are bad)
   if (*p == '0') { // 0 cannot be followed by an integer
     ++p;
-    if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
+    if (is_integer(*p)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_invalid_number(src);
 #endif
@@ -9341,7 +9762,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       }
       // we over-decrement by one when there is a '.'
       digit_count -= int(start - start_digits);
-      if (digit_count >= 19) {
+      if (unlikely(digit_count >= 19)) {
         // Ok, chances are good that we had an overflow!
         // this is almost never going to get called!!!
         // we start anew, going slowly!!!
@@ -9349,14 +9770,22 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
         // 10000000000000000000000000000000000000000000e+308
         // 3.1415926535897932384626433832795028841971693993751
         //
-        return slow_float_parsing((const char *) src, writer);
+        bool success = slow_float_parsing((const char *) src, writer);
+        // The number was already written, but we made a copy of the writer when we passed it
+        // to the slow_float_parsing() function, so we have to skip those tape spots now.
+        writer.skip_double();
+        return success;
       }
     }
     if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) ||
         (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!!
       // this is almost never going to get called!!!
       // we start anew, going slowly!!!
-      return slow_float_parsing((const char *) src, writer);
+      bool success = slow_float_parsing((const char *) src, writer);
+      // The number was already written, but we made a copy of the writer when we passed it to the
+      // slow_float_parsing() function, so we have to skip those tape spots now that we've returned
+      writer.skip_double();
+      return success;
     }
     bool success = true;
     double d = compute_float_64(exponent, i, negative, &success);
@@ -9365,7 +9794,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       success = parse_float_strtod((const char *)src, &d);
     }
     if (success) {
-      writer.write_double(d);
+      writer.append_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_float(d, src);
 #endif
@@ -9380,10 +9809,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
     if (unlikely(digit_count >= 18)) { // this is uncommon!!!
       // there is a good chance that we had an overflow, so we need
       // need to recover: we parse the whole thing again.
-      return parse_large_integer(src, writer, found_minus);
+      bool success = parse_large_integer(src, writer, found_minus);
+      // The number was already written, but we made a copy of the writer when we passed it to the
+      // parse_large_integer() function, so we have to skip those tape spots now that we've returned
+      writer.skip_large_integer();
+      return success;
     }
     i = negative ? 0 - i : i;
-    writer.write_s64(i);
+    writer.append_s64(i);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_integer(i, src);
 #endif
@@ -9408,6 +9841,72 @@ TARGET_HASWELL
 namespace simdjson {
 namespace haswell {
 
+/* begin file src/generic/stage2/logger.h */
+// This is for an internal-only stage 2 specific logger.
+// Set LOG_ENABLED = true to log what stage 2 is doing!
+namespace logger {
+  static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
+
+  static constexpr const bool LOG_ENABLED = false;
+  static constexpr const int LOG_EVENT_LEN = 30;
+  static constexpr const int LOG_BUFFER_LEN = 20;
+  static constexpr const int LOG_DETAIL_LEN = 50;
+  static constexpr const int LOG_INDEX_LEN = 10;
+
+  static int log_depth; // Not threadsafe. Log only.
+
+  // Helper for log output: returns c if c >= 0x20, otherwise a space, so control characters such as newline print as blanks
+  static really_inline char printable_char(char c) {
+    if (c >= 0x20) { // NOTE(review): where char is signed, bytes >= 0x80 compare negative and also become spaces
+      return c;
+    } else {
+      return ' ';
+    }
+  }
+
+  // Reset log_depth to 0 and print the log table header plus its dashed separator row (no-op unless LOG_ENABLED)
+  static really_inline void log_start() {
+    if (LOG_ENABLED) {
+      log_depth = 0;
+      printf("\n");
+      printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index");
+      printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES);
+    }
+  }
+
+  static really_inline void log_string(const char *message) { // prints message on a line of its own
+    if (LOG_ENABLED) { // compile-time constant: nothing is printed while LOG_ENABLED is false
+      printf("%s\n", message);
+    }
+  }
+
+  // Logs a single row of the log table (one cell per column printed by log_start); no-op unless LOG_ENABLED
+  template<typename S>
+  static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
+    if (LOG_ENABLED) {
+      printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
+      {
+        // Buffer column: the next LOG_BUFFER_LEN characters in the buffer.
+        printf("| ");
+        // Print the characters starting from the current structural's buffer position.
+        // Print spaces for unprintable or newline characters.
+        for (int i=0;i<LOG_BUFFER_LEN;i++) {
+          printf("%c", printable_char(structurals.current()[i]));
+        }
+        printf(" ");
+      }
+      printf("|    %c ", printable_char(structurals.current_char()));
+      printf("|    %c ", printable_char(structurals.peek_next_char()));
+      printf("| %5u ", structurals.parser.structural_indexes[*(structurals.current_structural+1)]); // NOTE(review): indexes structural_indexes by the next structural's stored value (a buffer offset) - confirm this is intended
+      printf("| %5u ", structurals.next_tape_index());
+      printf("| %-*s ", LOG_DETAIL_LEN, detail);
+      printf("| %*u ", LOG_INDEX_LEN, *structurals.current_structural);
+      printf("|\n");
+    }
+  }
+} // namespace logger
+
+/* end file src/generic/stage2/logger.h */
 /* begin file src/generic/stage2/atomparsing.h */
 namespace stage2 {
 namespace atomparsing {
@@ -9466,26 +9965,34 @@ namespace stage2 {
 
 class structural_iterator {
 public:
-  really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index)
-    : buf{_buf},
-     len{_len},
-     structural_indexes{_structural_indexes},
-     next_structural{next_structural_index}
-    {}
-  really_inline char advance_char() {
-    idx = structural_indexes[next_structural];
-    next_structural++;
-    c = *current();
-    return c;
+  const uint8_t* const buf;
+  uint32_t *current_structural;
+  dom_parser_implementation &parser;
+
+  // Start a structural iterator at the given index into the parser's structural_indexes
+  really_inline structural_iterator(dom_parser_implementation &_parser, size_t start_structural_index)
+    : buf{_parser.buf},
+      current_structural{&_parser.structural_indexes[start_structural_index]},
+      parser{_parser} {
   }
+  // Get the buffer position of the current structural character
+  really_inline const uint8_t* current() {
+    return &buf[*current_structural];
+  }
+  // Get the current structural character
   really_inline char current_char() {
-    return c;
+    return buf[*current_structural];
   }
-  really_inline const uint8_t* current() {
-    return &buf[idx];
+  // Get the next structural character without advancing
+  really_inline char peek_next_char() {
+    return buf[*(current_structural+1)];
+  }
+  really_inline char advance_char() {
+    current_structural++;
+    return buf[*current_structural];
   }
   really_inline size_t remaining_len() {
-    return len - idx;
+    return parser.len - *current_structural;
   }
   template<typename F>
   really_inline bool with_space_terminated_copy(const F& f) {
@@ -9502,32 +10009,25 @@ class structural_iterator {
     * practice unless you are in the strange scenario where you have many JSON
     * documents made of single atoms.
     */
-    char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
+    char *copy = static_cast<char *>(malloc(parser.len + SIMDJSON_PADDING));
     if (copy == nullptr) {
       return true;
     }
-    memcpy(copy, buf, len);
-    memset(copy + len, ' ', SIMDJSON_PADDING);
-    bool result = f(reinterpret_cast<const uint8_t*>(copy), idx);
+    memcpy(copy, buf, parser.len);
+    memset(copy + parser.len, ' ', SIMDJSON_PADDING);
+    bool result = f(reinterpret_cast<const uint8_t*>(copy), *current_structural);
     free(copy);
     return result;
   }
   really_inline bool past_end(uint32_t n_structural_indexes) {
-    return next_structural+1 > n_structural_indexes;
+    return current_structural >= &parser.structural_indexes[n_structural_indexes];
   }
   really_inline bool at_end(uint32_t n_structural_indexes) {
-    return next_structural+1 == n_structural_indexes;
+    return current_structural == &parser.structural_indexes[n_structural_indexes];
   }
-  really_inline size_t next_structural_index() {
-    return next_structural;
+  really_inline bool at_beginning() {
+    return current_structural == parser.structural_indexes.get();
   }
-
-  const uint8_t* const buf;
-  const size_t len;
-  const uint32_t* const structural_indexes;
-  size_t next_structural; // next structural index
-  size_t idx{0}; // location of the structural character in the input (buf)
-  uint8_t c{0};  // used to track the (structural) character we are looking at
 };
 
 } // namespace stage2
@@ -9539,8 +10039,105 @@ class structural_iterator {
 // "simdjson/stage2.h" (this simplifies amalgation)
 
 namespace stage2 {
+namespace { // Make everything here private
+
+/* begin file src/generic/stage2/tape_writer.h */
+struct tape_writer {
+  /** The next place to write to tape; every append*() and skip*() call advances it. */
+  uint64_t *next_tape_loc;
+  
+  /** Write a signed 64-bit value to tape (an INT64 entry followed by the value). */
+  really_inline void append_s64(int64_t value) noexcept;
+
+  /** Write an unsigned 64-bit value to tape (a UINT64 entry followed by the value). */
+  really_inline void append_u64(uint64_t value) noexcept;
+
+  /** Write a double value to tape (a DOUBLE entry followed by the value's 64 bits). */
+  really_inline void append_double(double value) noexcept;
+
+  /**
+   * Append a tape entry (an 8-bit type, and 56 bits worth of value).
+   */
+  really_inline void append(uint64_t val, internal::tape_type t) noexcept;
+
+  /**
+   * Skip the current tape entry without writing.
+   *
+   * Used to skip the start of the container, since we'll come back later to fill it in when the
+   * container ends.
+   */
+  really_inline void skip() noexcept;
+
+  /**
+   * Skip the number of tape entries necessary to write a large u64 or i64.
+   */
+  really_inline void skip_large_integer() noexcept;
+
+  /**
+   * Skip the number of tape entries necessary to write a double.
+   */
+  really_inline void skip_double() noexcept;
+
+  /**
+   * Write a value to a known location on tape, without moving next_tape_loc.
+   *
+   * Used to go back and write out the start of a container after the container ends.
+   */
+  really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
+
+private:
+  /**
+   * Append both the tape entry, and a supplementary value following it. Used for types that need
+   * all 64 bits, such as double and int64_t.
+   */
+  template<typename T>
+  really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
+}; // struct tape_writer
+
+really_inline void tape_writer::append_s64(int64_t value) noexcept {
+  append2(0, value, internal::tape_type::INT64); // INT64 entry with a zero payload, then the 64-bit value in the next entry
+}
+
+really_inline void tape_writer::append_u64(uint64_t value) noexcept {
+  append(0, internal::tape_type::UINT64); // UINT64 entry with a zero payload first...
+  *next_tape_loc = value;                 // ...then the full 64-bit value in the following entry
+  next_tape_loc++;
+}
+
+/** Write a double value to tape: a DOUBLE entry, then the double's 64 bits copied bytewise into the next entry. */
+really_inline void tape_writer::append_double(double value) noexcept {
+  append2(0, value, internal::tape_type::DOUBLE);
+}
+
+really_inline void tape_writer::skip() noexcept {
+  next_tape_loc++; // leave one entry unwritten; it can be filled in later with write()
+}
+
+really_inline void tape_writer::skip_large_integer() noexcept {
+  next_tape_loc += 2; // two entries: the type entry plus the 64-bit value entry, as advanced by append_s64()/append_u64()
+}
+
+really_inline void tape_writer::skip_double() noexcept {
+  next_tape_loc += 2; // two entries: the type entry plus the 64-bit value entry, as advanced by append_double()
+}
+
+really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
+  *next_tape_loc = val | ((uint64_t(char(t))) << 56); // type tag goes in the top 8 bits, OR'ed with val (val is not masked here)
+  next_tape_loc++;
+}
+
+template<typename T>
+really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
+  append(val, t); // first entry: type tag plus val
+  static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
+  memcpy(next_tape_loc, &val2, sizeof(val2)); // second entry: val2 copied bytewise, with no numeric conversion
+  next_tape_loc++;
+}
 
-using internal::ret_address;
+really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
+  tape_loc = val | ((uint64_t(char(t))) << 56); // overwrites the whole entry at tape_loc; same tag/value layout as append()
+}
+/* end file src/generic/stage2/tape_writer.h */
 
 #ifdef SIMDJSON_USE_COMPUTED_GOTO
 #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue }
@@ -9571,102 +10168,88 @@ using internal::ret_address;
 #endif // SIMDJSON_USE_COMPUTED_GOTO
 
 struct unified_machine_addresses {
-  ret_address array_begin;
-  ret_address array_continue;
-  ret_address error;
-  ret_address finish;
-  ret_address object_begin;
-  ret_address object_continue;
+  ret_address_t array_begin;
+  ret_address_t array_continue;
+  ret_address_t error;
+  ret_address_t finish;
+  ret_address_t object_begin;
+  ret_address_t object_continue;
 };
 
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } }
 
-struct number_writer {
-  parser &doc_parser;
-  
-  really_inline void write_s64(int64_t value) noexcept {
-    write_tape(0, internal::tape_type::INT64);
-    std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value));
-    ++doc_parser.current_loc;
-  }
-  really_inline void write_u64(uint64_t value) noexcept {
-    write_tape(0, internal::tape_type::UINT64);
-    doc_parser.doc.tape[doc_parser.current_loc++] = value;
-  }
-  really_inline void write_double(double value) noexcept {
-    write_tape(0, internal::tape_type::DOUBLE);
-    static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size");
-    memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double));
-    // doc.tape[doc.current_loc++] = *((uint64_t *)&d);
-  }
-  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
-    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
-  }
-}; // struct number_writer
-
-struct structural_parser {
-  structural_iterator structurals;
-  parser &doc_parser;
+struct structural_parser : structural_iterator {
+  /** Lets you append to the tape */
+  tape_writer tape;
   /** Next write location in the string buf for stage 2 parsing */
-  uint8_t *current_string_buf_loc{};
-  uint32_t depth;
-
-  really_inline structural_parser(
-    const uint8_t *buf,
-    size_t len,
-    parser &_doc_parser,
-    uint32_t next_structural = 0
-  ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {}
-
-  WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) {
-    doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc;
-    doc_parser.containing_scope[depth].count = 0;
-    write_tape(0, type); // if the document is correct, this gets rewritten later
-    doc_parser.ret_address[depth] = continue_state;
+  uint8_t *current_string_buf_loc;
+  /** Current depth (nested objects and arrays) */
+  uint32_t depth{0};
+
+  // Non-streaming callers pass an explicit 0 as start_structural_index, which enables optimizations
+  really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index)
+    : structural_iterator(_parser, start_structural_index),
+      tape{parser.doc->tape.get()},
+      current_string_buf_loc{parser.doc->string_buf.get()} {
+  }
+
+  WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) {
+    parser.containing_scope[depth].tape_index = next_tape_index();
+    parser.containing_scope[depth].count = 0;
+    tape.skip(); // We don't actually *write* the start element until the end.
+    parser.ret_address[depth] = continue_state;
     depth++;
-    return depth >= doc_parser.max_depth();
+    bool exceeded_max_depth = depth >= parser.max_depth();
+    if (exceeded_max_depth) { log_error("Exceeded max depth!"); }
+    return exceeded_max_depth;
   }
 
-  WARN_UNUSED really_inline bool start_document(ret_address continue_state) {
-    return start_scope(internal::tape_type::ROOT, continue_state);
+  WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) {
+    log_start_value("document");
+    return start_scope(continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_object(ret_address continue_state) {
-    return start_scope(internal::tape_type::START_OBJECT, continue_state);
+  WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) {
+    log_start_value("object");
+    return start_scope(continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_array(ret_address continue_state) {
-    return start_scope(internal::tape_type::START_ARRAY, continue_state);
+  WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) {
+    log_start_value("array");
+    return start_scope(continue_state);
   }
 
   // this function is responsible for annotating the start of the scope
-  really_inline void end_scope(internal::tape_type type) noexcept {
+  really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept {
     depth--;
-    // write our doc.tape location to the header scope
+    // write our doc->tape location to the header scope
     // The root scope gets written *at* the previous location.
-    write_tape(doc_parser.containing_scope[depth].tape_index, type);
+    tape.append(parser.containing_scope[depth].tape_index, end);
     // count can overflow if it exceeds 24 bits... so we saturate
     // the convention being that a cnt of 0xffffff or more is undetermined in value (>=  0xffffff).
-    const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index;
-    const uint32_t count = doc_parser.containing_scope[depth].count;
+    const uint32_t start_tape_index = parser.containing_scope[depth].tape_index;
+    const uint32_t count = parser.containing_scope[depth].count;
     const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
-    // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index]
-    doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32);
+    // The start entry was skipped in start_scope(); write it in full now: start type, saturated count, and the tape index just past this scope's end entry
+    tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start);
+  }
+
+  really_inline uint32_t next_tape_index() {
+    return uint32_t(tape.next_tape_loc - parser.doc->tape.get());
   }
 
   really_inline void end_object() {
-    end_scope(internal::tape_type::END_OBJECT);
+    log_end_value("object");
+    end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
   }
   really_inline void end_array() {
-    end_scope(internal::tape_type::END_ARRAY);
+    log_end_value("array");
+    end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
   }
   really_inline void end_document() {
-    end_scope(internal::tape_type::ROOT);
-  }
-
-  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
-    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
+    log_end_value("document");
+    end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT);
   }
 
   // increment_count increments the count of keys in an object or values in an array.
@@ -9674,17 +10257,16 @@ struct structural_parser {
   // must be increment in the preceding depth (depth-1) where the array or
   // the object resides.
   really_inline void increment_count() {
-    doc_parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
+    parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
   }
 
   really_inline uint8_t *on_start_string() noexcept {
-    /* we advance the point, accounting for the fact that we have a NULL
-      * termination         */
-    write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING);
+    // we advance the point, accounting for the fact that we have a NULL termination
+    tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING);
     return current_string_buf_loc + sizeof(uint32_t);
   }
 
-  really_inline bool on_end_string(uint8_t *dst) noexcept {
+  really_inline void on_end_string(uint8_t *dst) noexcept {
     uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
     // TODO check for overflow in case someone has a crazy string (>=4GB?)
     // But only add the overflow check when the document itself exceeds 4GB
@@ -9694,73 +10276,49 @@ struct structural_parser {
     // be NULL terminated? It comes at a small cost
     *dst = 0;
     current_string_buf_loc = dst + 1;
-    return true;
   }
 
-  WARN_UNUSED really_inline bool parse_string() {
+  WARN_UNUSED really_inline bool parse_string(bool key = false) {
+    log_value(key ? "key" : "string");
     uint8_t *dst = on_start_string();
-    dst = stringparsing::parse_string(structurals.current(), dst);
+    dst = stringparsing::parse_string(current(), dst);
     if (dst == nullptr) {
+      log_error("Invalid escape in string");
       return true;
     }
-    return !on_end_string(dst);
+    on_end_string(dst);
+    return false;
   }
 
   WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) {
-    number_writer writer{doc_parser};
-    return !numberparsing::parse_number(src, found_minus, writer);
+    log_value("number");
+    bool succeeded = numberparsing::parse_number(src, found_minus, tape);
+    if (!succeeded) { log_error("Invalid number"); }
+    return !succeeded;
   }
   WARN_UNUSED really_inline bool parse_number(bool found_minus) {
-    return parse_number(structurals.current(), found_minus);
+    return parse_number(current(), found_minus);
   }
 
-  WARN_UNUSED really_inline bool parse_atom() {
-    switch (structurals.current_char()) {
-      case 't':
-        if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; }
-        write_tape(0, internal::tape_type::TRUE_VALUE);
-        break;
-      case 'f':
-        if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; }
-        write_tape(0, internal::tape_type::FALSE_VALUE);
-        break;
-      case 'n':
-        if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; }
-        write_tape(0, internal::tape_type::NULL_VALUE);
-        break;
-      default:
-        return true;
-    }
-    return false;
-  }
-
-  WARN_UNUSED really_inline bool parse_single_atom() {
-    switch (structurals.current_char()) {
-      case 't':
-        if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; }
-        write_tape(0, internal::tape_type::TRUE_VALUE);
-        break;
-      case 'f':
-        if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; }
-        write_tape(0, internal::tape_type::FALSE_VALUE);
-        break;
-      case 'n':
-        if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; }
-        write_tape(0, internal::tape_type::NULL_VALUE);
-        break;
-      default:
-        return true;
-    }
-    return false;
-  }
-
-  WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) {
-    switch (structurals.current_char()) {
+  WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) {
+    switch (advance_char()) {
     case '"':
       FAIL_IF( parse_string() );
       return continue_state;
-    case 't': case 'f': case 'n':
-      FAIL_IF( parse_atom() );
+    case 't':
+      log_value("true");
+      FAIL_IF( !atomparsing::is_valid_true_atom(current()) );
+      tape.append(0, internal::tape_type::TRUE_VALUE);
+      return continue_state;
+    case 'f':
+      log_value("false");
+      FAIL_IF( !atomparsing::is_valid_false_atom(current()) );
+      tape.append(0, internal::tape_type::FALSE_VALUE);
+      return continue_state;
+    case 'n':
+      log_value("null");
+      FAIL_IF( !atomparsing::is_valid_null_atom(current()) );
+      tape.append(0, internal::tape_type::NULL_VALUE);
       return continue_state;
     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
@@ -9776,40 +10334,27 @@ struct structural_parser {
       FAIL_IF( start_array(continue_state) );
       return addresses.array_begin;
     default:
+      log_error("Non-value found when value was expected!");
       return addresses.error;
     }
   }
 
   WARN_UNUSED really_inline error_code finish() {
-    // the string might not be NULL terminated.
-    if ( !structurals.at_end(doc_parser.n_structural_indexes) ) {
-      return on_error(TAPE_ERROR);
-    }
     end_document();
+    parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]);
+
     if (depth != 0) {
-      return on_error(TAPE_ERROR);
+      log_error("Unclosed objects or arrays!");
+      return parser.error = TAPE_ERROR;
     }
-    if (doc_parser.containing_scope[depth].tape_index != 0) {
-      return on_error(TAPE_ERROR);
-    }
-
-    return on_success(SUCCESS);
-  }
 
-  really_inline error_code on_error(error_code new_error_code) noexcept {
-    doc_parser.error = new_error_code;
-    return new_error_code;
-  }
-  really_inline error_code on_success(error_code success_code) noexcept {
-    doc_parser.error = success_code;
-    doc_parser.valid = true;
-    return success_code;
+    return SUCCESS;
   }
 
   WARN_UNUSED really_inline error_code error() {
-    /* We do not need the next line because this is done by doc_parser.init_stage2(),
+    /* We do not need the next line because this is done by parser.init_stage2(),
     * pessimistically.
-    * doc_parser.is_valid  = false;
+    * parser.is_valid  = false;
     * At this point in the code, we have all the time in the world.
     * Note that we know exactly where we are in the document so we could,
     * without any overhead on the processing code, report a specific
@@ -9817,12 +10362,12 @@ struct structural_parser {
     * We could even trigger special code paths to assess what happened
     * carefully,
     * all without any added cost. */
-    if (depth >= doc_parser.max_depth()) {
-      return on_error(DEPTH_ERROR);
+    if (depth >= parser.max_depth()) {
+      return parser.error = DEPTH_ERROR;
     }
-    switch (structurals.current_char()) {
+    switch (current_char()) {
     case '"':
-      return on_error(STRING_ERROR);
+      return parser.error = STRING_ERROR;
     case '0':
     case '1':
     case '2':
@@ -9834,302 +10379,173 @@ struct structural_parser {
     case '8':
     case '9':
     case '-':
-      return on_error(NUMBER_ERROR);
+      return parser.error = NUMBER_ERROR;
     case 't':
-      return on_error(T_ATOM_ERROR);
+      return parser.error = T_ATOM_ERROR;
     case 'n':
-      return on_error(N_ATOM_ERROR);
+      return parser.error = N_ATOM_ERROR;
     case 'f':
-      return on_error(F_ATOM_ERROR);
+      return parser.error = F_ATOM_ERROR;
     default:
-      return on_error(TAPE_ERROR);
+      return parser.error = TAPE_ERROR;
     }
   }
 
   really_inline void init() {
-    current_string_buf_loc = doc_parser.doc.string_buf.get();
-    doc_parser.current_loc = 0;
-    doc_parser.valid = false;
-    doc_parser.error = UNINITIALIZED;
+    log_start();
+    parser.error = UNINITIALIZED;
   }
 
-  WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) {
-    init(); // sets is_valid to false
-    if (len > doc_parser.capacity()) {
-      return CAPACITY;
+  WARN_UNUSED really_inline error_code start(ret_address_t finish_state) {
+    // If there are no structurals left, return EMPTY
+    if (at_end(parser.n_structural_indexes)) {
+      return parser.error = EMPTY;
     }
-    // Advance to the first character as soon as possible
-    structurals.advance_char();
+
+    init();
     // Push the root scope (there is always at least one scope)
     if (start_document(finish_state)) {
-      return on_error(DEPTH_ERROR);
+      return parser.error = DEPTH_ERROR;
     }
     return SUCCESS;
   }
 
-  really_inline char advance_char() {
-    return structurals.advance_char();
-  }
-};
-
-// Redefine FAIL_IF to use goto since it'll be used inside the function now
-#undef FAIL_IF
-#define FAIL_IF(EXPR) { if (EXPR) { goto error; } }
-
-} // namespace stage2
-
-/************
- * The JSON is parsed to a tape, see the accompanying tape.md file
- * for documentation.
- ***********/
-WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
-  static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
-  stage2::structural_parser parser(buf, len, doc_parser);
-  error_code result = parser.start(len, addresses.finish);
-  if (result) { return result; }
-
-  //
-  // Read first value
-  //
-  switch (parser.structurals.current_char()) {
-  case '{':
-    FAIL_IF( parser.start_object(addresses.finish) );
-    goto object_begin;
-  case '[':
-    FAIL_IF( parser.start_array(addresses.finish) );
-    goto array_begin;
-  case '"':
-    FAIL_IF( parser.parse_string() );
-    goto finish;
-  case 't': case 'f': case 'n':
-    FAIL_IF( parser.parse_single_atom() );
-    goto finish;
-  case '0': case '1': case '2': case '3': case '4':
-  case '5': case '6': case '7': case '8': case '9':
-    FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
-        return parser.parse_number(&copy[idx], false);
-      })
-    );
-    goto finish;
-  case '-':
-    FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
-        return parser.parse_number(&copy[idx], true);
-      })
-    );
-    goto finish;
-  default:
-    goto error;
-  }
-
-//
-// Object parser states
-//
-object_begin:
-  switch (parser.advance_char()) {
-  case '"': {
-    parser.increment_count();
-    FAIL_IF( parser.parse_string() );
-    goto object_key_state;
-  }
-  case '}':
-    parser.end_object();
-    goto scope_end;
-  default:
-    goto error;
-  }
-
-object_key_state:
-  FAIL_IF( parser.advance_char() != ':' );
-  parser.advance_char();
-  GOTO( parser.parse_value(addresses, addresses.object_continue) );
-
-object_continue:
-  switch (parser.advance_char()) {
-  case ',':
-    parser.increment_count();
-    FAIL_IF( parser.advance_char() != '"' );
-    FAIL_IF( parser.parse_string() );
-    goto object_key_state;
-  case '}':
-    parser.end_object();
-    goto scope_end;
-  default:
-    goto error;
-  }
-
-scope_end:
-  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
-
-//
-// Array parser states
-//
-array_begin:
-  if (parser.advance_char() == ']') {
-    parser.end_array();
-    goto scope_end;
+  really_inline void log_value(const char *type) {
+    logger::log_line(*this, "", type, "");
   }
-  parser.increment_count();
 
-main_array_switch:
-  /* we call update char on all paths in, so we can peek at parser.c on the
-   * on paths that can accept a close square brace (post-, and at start) */
-  GOTO( parser.parse_value(addresses, addresses.array_continue) );
-
-array_continue:
-  switch (parser.advance_char()) {
-  case ',':
-    parser.increment_count();
-    parser.advance_char();
-    goto main_array_switch;
-  case ']':
-    parser.end_array();
-    goto scope_end;
-  default:
-    goto error;
+  static really_inline void log_start() {
+    logger::log_start();
   }
 
-finish:
-  return parser.finish();
-
-error:
-  return parser.error();
-}
-
-WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
-  error_code code = stage1(buf, len, doc_parser, false);
-  if (!code) {
-    code = stage2(buf, len, doc_parser);
+  really_inline void log_start_value(const char *type) {
+    logger::log_line(*this, "+", type, "");
+    if (logger::LOG_ENABLED) { logger::log_depth++; }
   }
-  return code;
-}
-/* end file src/generic/stage2/structural_parser.h */
-/* begin file src/generic/stage2/streaming_structural_parser.h */
-namespace stage2 {
-
-struct streaming_structural_parser: structural_parser {
-  really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {}
 
-  // override to add streaming
-  WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) {
-    init(); // sets is_valid to false
-    // Capacity ain't no thang for streaming, so we don't check it.
-    // Advance to the first character as soon as possible
-    advance_char();
-    // Push the root scope (there is always at least one scope)
-    if (start_document(finish_parser)) {
-      return on_error(DEPTH_ERROR);
-    }
-    return SUCCESS;
+  really_inline void log_end_value(const char *type) {
+    if (logger::LOG_ENABLED) { logger::log_depth--; }
+    logger::log_line(*this, "-", type, "");
   }
 
-  // override to add streaming
-  WARN_UNUSED really_inline error_code finish() {
-    if ( structurals.past_end(doc_parser.n_structural_indexes) ) {
-      return on_error(TAPE_ERROR);
-    }
-    end_document();
-    if (depth != 0) {
-      return on_error(TAPE_ERROR);
-    }
-    if (doc_parser.containing_scope[depth].tape_index != 0) {
-      return on_error(TAPE_ERROR);
-    }
-    bool finished = structurals.at_end(doc_parser.n_structural_indexes);
-    return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
+  really_inline void log_error(const char *error) {
+    logger::log_line(*this, "", "ERROR", error);
   }
-};
+}; // struct structural_parser
 
-} // namespace stage2
+// Redefine FAIL_IF to use goto since it'll be used inside the function now
+#undef FAIL_IF
+#define FAIL_IF(EXPR) { if (EXPR) { goto error; } }
 
-/************
- * The JSON is parsed to a tape, see the accompanying tape.md file
- * for documentation.
- ***********/
-WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept {
+template<bool STREAMING>
+WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
+  dom_parser.doc = &doc;
   static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
-  stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json));
-  error_code result = parser.start(len, addresses.finish);
+  stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
+  error_code result = parser.start(addresses.finish);
   if (result) { return result; }
+
   //
   // Read first value
   //
-  switch (parser.structurals.current_char()) {
+  switch (parser.current_char()) {
   case '{':
     FAIL_IF( parser.start_object(addresses.finish) );
     goto object_begin;
   case '[':
     FAIL_IF( parser.start_array(addresses.finish) );
+    // Make sure the outer array is closed before continuing; otherwise, there are ways we could get
+    // into memory corruption. See https://github.com/simdjson/simdjson/issues/906
+    if (!STREAMING) {
+      if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') {
+        goto error;
+      }
+    }
     goto array_begin;
   case '"':
     FAIL_IF( parser.parse_string() );
     goto finish;
-  case 't': case 'f': case 'n':
-    FAIL_IF( parser.parse_single_atom() );
+  case 't':
+    parser.log_value("true");
+    FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) );
+    parser.tape.append(0, internal::tape_type::TRUE_VALUE);
+    goto finish;
+  case 'f':
+    parser.log_value("false");
+    FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) );
+    parser.tape.append(0, internal::tape_type::FALSE_VALUE);
+    goto finish;
+  case 'n':
+    parser.log_value("null");
+    FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) );
+    parser.tape.append(0, internal::tape_type::NULL_VALUE);
     goto finish;
   case '0': case '1': case '2': case '3': case '4':
   case '5': case '6': case '7': case '8': case '9':
     FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], false);
       })
     );
     goto finish;
   case '-':
     FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], true);
       })
     );
     goto finish;
   default:
+    parser.log_error("Document starts with a non-value character");
     goto error;
   }
 
 //
-// Object parser parsers
+// Object parser states
 //
 object_begin:
   switch (parser.advance_char()) {
   case '"': {
-    FAIL_IF( parser.parse_string() );
-    goto object_key_parser;
+    parser.increment_count();
+    FAIL_IF( parser.parse_string(true) );
+    goto object_key_state;
   }
   case '}':
     parser.end_object();
     goto scope_end;
   default:
+    parser.log_error("Object does not start with a key");
     goto error;
   }
 
-object_key_parser:
-  FAIL_IF( parser.advance_char() != ':' );
-  parser.increment_count();
-  parser.advance_char();
+object_key_state:
+  if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; }
   GOTO( parser.parse_value(addresses, addresses.object_continue) );
 
 object_continue:
   switch (parser.advance_char()) {
   case ',':
-    FAIL_IF( parser.advance_char() != '"' );
-    FAIL_IF( parser.parse_string() );
-    goto object_key_parser;
+    parser.increment_count();
+    if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; }
+    FAIL_IF( parser.parse_string(true) );
+    goto object_key_state;
   case '}':
     parser.end_object();
     goto scope_end;
   default:
+    parser.log_error("No comma between object fields");
     goto error;
   }
 
 scope_end:
-  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
+  CONTINUE( parser.parser.ret_address[parser.depth] );
 
 //
-// Array parser parsers
+// Array parser states
 //
 array_begin:
-  if (parser.advance_char() == ']') {
+  if (parser.peek_next_char() == ']') {
+    parser.advance_char();
     parser.end_array();
     goto scope_end;
   }
@@ -10144,31 +10560,208 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa
   switch (parser.advance_char()) {
   case ',':
     parser.increment_count();
-    parser.advance_char();
     goto main_array_switch;
   case ']':
     parser.end_array();
     goto scope_end;
   default:
+    parser.log_error("Missing comma between array values");
     goto error;
   }
 
 finish:
-  next_json = parser.structurals.next_structural_index();
   return parser.finish();
 
 error:
   return parser.error();
 }
-/* end file src/generic/stage2/streaming_structural_parser.h */
+
+} // namespace {}
+} // namespace stage2
+
+/************
+ * Stage 2 for a single document: parses the structurals into _doc's tape (see the accompanying
+ * tape.md file for documentation) and fails unless every structural index was consumed.
+ ***********/
+WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
+  error_code result = stage2::parse_structurals<false>(*this, _doc);
+  if (result) { return result; } // propagate the parse error unchanged
+
+  // If we didn't make it to the end, it's an error
+  if ( next_structural_index != n_structural_indexes ) {
+    logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
+    return error = TAPE_ERROR; // records TAPE_ERROR in this->error and returns it
+  }
+
+  return SUCCESS;
+}
+
+/************
+ * Streaming variant of stage 2: parsing starts at next_structural_index instead of 0 and fills
+ * _doc's tape (see tape.md). This wrapper does not reject structurals left over afterwards.
+ ***********/
+WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
+  return stage2::parse_structurals<true>(*this, _doc); // STREAMING = true
+}
+/* end file src/generic/stage2/tape_writer.h */
+
+WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept { // full parse: stage 1, then stage 2
+  error_code err = stage1(_buf, _len, false); // partial = false
+  if (err) { return err; } // stage 2 is skipped when stage 1 fails
+  return stage2(_doc); // single-document stage 2; its result is the overall result
+}
 
 } // namespace haswell
 } // namespace simdjson
 UNTARGET_REGION
-/* end file src/generic/stage2/streaming_structural_parser.h */
+/* end file src/generic/stage2/tape_writer.h */
 #endif
 #if SIMDJSON_IMPLEMENTATION_WESTMERE
-/* begin file src/westmere/stage1.cpp */
+/* begin file src/westmere/implementation.cpp */
+/* westmere/implementation.h already included: #include "westmere/implementation.h" */
+/* begin file src/westmere/dom_parser_implementation.h */
+#ifndef SIMDJSON_WESTMERE_DOM_PARSER_IMPLEMENTATION_H
+#define SIMDJSON_WESTMERE_DOM_PARSER_IMPLEMENTATION_H
+
+/* isadetection.h already included: #include "isadetection.h" */
+
+namespace simdjson {
+namespace westmere {
+
+/* begin file src/generic/dom_parser_implementation.h */
+// Bookkeeping for one open scope. Expectation: sizeof(scope_descriptor) = 64/8 (two uint32_t fields).
+struct scope_descriptor {
+  uint32_t tape_index; // where on the tape the scope ([ or {) begins
+  uint32_t count; // how many elements are in the scope
+}; // struct scope_descriptor
+
+#ifdef SIMDJSON_USE_COMPUTED_GOTO
+typedef void* ret_address_t;
+#else
+typedef char ret_address_t;
+#endif
+
+class dom_parser_implementation final : public internal::dom_parser_implementation {
+public:
+  /** Tape location (and element count) of each open { or [; allocated by set_max_depth() */
+  std::unique_ptr<scope_descriptor[]> containing_scope{};
+  /** Return address of each open { or [, indexed by depth; allocated by set_max_depth() */
+  std::unique_ptr<ret_address_t[]> ret_address{};
+  /** Buffer passed to stage 1 */
+  const uint8_t *buf{};
+  /** Length passed to stage 1 */
+  size_t len{0};
+  /** Document passed to stage 2 (assigned at the start of stage 2 parsing) */
+  dom::document *doc{};
+  /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */
+  error_code error{UNINITIALIZED};
+
+  really_inline dom_parser_implementation();
+  dom_parser_implementation(const dom_parser_implementation &) = delete; // non-copyable
+  dom_parser_implementation & operator=(const dom_parser_implementation &) = delete; // non-copy-assignable
+
+  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final; // stage 1 followed by stage 2
+  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final;
+  WARN_UNUSED error_code check_for_unclosed_array() noexcept;
+  WARN_UNUSED error_code stage2(dom::document &doc) noexcept final; // single-document stage 2
+  WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final; // streaming stage 2
+  WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final; // (re)allocates stage 1 state
+  WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final; // (re)allocates stage 2 state
+};
+
+/* begin file src/generic/stage1/allocate.h */
+namespace stage1 {
+namespace allocate {
+
+//
+// Allocates the stage 1 structural index array in the parser for the given capacity and resets
+// the structural count to 0. Returns MEMALLOC if the allocation fails, SUCCESS otherwise.
+really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) {
+  size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7; // NOTE(review): the "+ 2 + 7" slack is undocumented here -- confirm what stage 1 requires
+  parser.structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); // nothrow: failure yields a null pointer
+  if (!parser.structural_indexes) { return MEMALLOC; }
+  parser.structural_indexes[0] = 0;
+  parser.n_structural_indexes = 0;
+  return SUCCESS;
+}
+
+} // namespace allocate
+} // namespace stage1
+/* end file src/generic/stage1/allocate.h */
+/* begin file src/generic/stage2/allocate.h */
+namespace stage2 {
+namespace allocate {
+
+//
+// Allocates the stage 2 per-depth arrays (containing_scope and ret_address), max_depth entries
+// each. Returns MEMALLOC if either allocation fails, SUCCESS otherwise.
+really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) {
+  parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]);
+  parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]);
+
+  if (!parser.ret_address || !parser.containing_scope) { // both are checked only after both attempts
+    return MEMALLOC;
+  }
+  return SUCCESS;
+}
+
+} // namespace allocate
+} // namespace stage2
+/* end file src/generic/stage2/allocate.h */
+
+really_inline dom_parser_implementation::dom_parser_implementation() {}
+
+// Leaving these here so they can be inlined if so desired
+WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
+  error_code err = stage1::allocate::set_capacity(*this, capacity);
+  if (err) { _capacity = 0; return err; } // on allocation failure, record a capacity of 0
+  _capacity = capacity; // only recorded after the allocation succeeded
+  return SUCCESS;
+}
+
+WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept { // (re)allocates stage 2 state
+  error_code err = stage2::allocate::set_max_depth(*this, max_depth);
+  if (err) { _max_depth = 0; return err; } // on allocation failure, record a max depth of 0
+  _max_depth = max_depth; // only recorded after the allocation succeeded
+  return SUCCESS;
+}
+/* end file src/generic/stage2/allocate.h */
+
+} // namespace westmere
+} // namespace simdjson
+
+#endif // SIMDJSON_WESTMERE_DOM_PARSER_IMPLEMENTATION_H
+/* end file src/generic/stage2/allocate.h */
+
+TARGET_WESTMERE // this is the westmere translation unit: compile it for westmere, not haswell
+
+namespace simdjson {
+namespace westmere {
+
+// Creates a westmere DOM parser in dst and sizes it; returns the first allocation error, if any.
+WARN_UNUSED error_code implementation::create_dom_parser_implementation(
+  size_t capacity, size_t max_depth, std::unique_ptr<internal::dom_parser_implementation>& dst
+) const noexcept {
+  dst.reset( new (std::nothrow) dom_parser_implementation() );
+  if (!dst) { return MEMALLOC; }
+  // set_capacity/set_max_depth are WARN_UNUSED and can fail with MEMALLOC: do not drop the result.
+  if (error_code err = dst->set_capacity(capacity)) { return err; }
+  if (error_code err = dst->set_max_depth(max_depth)) { return err; }
+  return SUCCESS;
+}
+
+} // namespace westmere
+} // namespace simdjson
+
+UNTARGET_REGION
+/* end file src/generic/stage2/allocate.h */
+/* begin file src/westmere/dom_parser_implementation.cpp */
+/* westmere/implementation.h already included: #include "westmere/implementation.h" */
+/* westmere/dom_parser_implementation.h already included: #include "westmere/dom_parser_implementation.h" */
+
+//
+// Stage 1
+//
 /* begin file src/westmere/bitmask.h */
 #ifndef SIMDJSON_WESTMERE_BITMASK_H
 #define SIMDJSON_WESTMERE_BITMASK_H
@@ -10739,24 +11332,21 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-  really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
-  really_inline size_t block_index() { return idx; }
-  really_inline bool has_full_block() const {
-    return idx < lenminusstep;
-  }
-  really_inline const uint8_t *full_block() const {
-    return &buf[idx];
-  }
-  really_inline bool has_remainder() const {
-    return idx < len;
-  }
-  really_inline void get_remainder(uint8_t *tmp_buf) const {
-    memset(tmp_buf, 0x20, STEP_SIZE);
-    memcpy(tmp_buf, buf + idx, len - idx);
-  }
-  really_inline void advance() {
-    idx += STEP_SIZE;
-  }
+  really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
+  really_inline size_t block_index();
+  really_inline bool has_full_block() const;
+  really_inline const uint8_t *full_block() const;
+  /**
+   * Get the last block, padded with spaces.
+   *
+   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
+   * function fills the buffer with spaces and returns 0). In particular, if len == STEP_SIZE there
+   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+   *
+   * @return the number of effective characters in the last block.
+   */
+  really_inline size_t get_remainder(uint8_t *dst) const;
+  really_inline void advance();
 private:
   const uint8_t *buf;
   const size_t len;
@@ -10764,6 +11354,18 @@ struct buf_block_reader {
   size_t idx;
 };
 
+constexpr const int TITLE_SIZE = 12;
+
+// Debug helper: renders one 64-byte block of input as printable text (see per-line notes).
+UNUSED static char * format_input_text_64(const uint8_t *text) {
+  static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1); // function-local static: allocated once, reused by every call, never freed; malloc result is not checked
+  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
+    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); // signed compare: control bytes and bytes >= 0x80 both print as '_'
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf; // points at the shared static buffer; the next call overwrites it
+}
+
 // Routines to print masks and text for debugging bitmask operations
 UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
   static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
@@ -10783,6 +11385,34 @@ UNUSED static char * format_mask(uint64_t mask) {
   buf[64] = '\0';
   return buf;
 }
+
+template<size_t STEP_SIZE>
+really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {} // lenminusstep is clamped to 0 for inputs shorter than one block
+
+template<size_t STEP_SIZE>
+really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; } // byte offset of the current block
+
+template<size_t STEP_SIZE>
+really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
+  return idx < lenminusstep; // strict '<': a final block of exactly STEP_SIZE bytes is left to get_remainder()
+}
+
+template<size_t STEP_SIZE>
+really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
+  return &buf[idx]; // points directly into the input buffer, no copy
+}
+
+template<size_t STEP_SIZE>
+really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
+  memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+  memcpy(dst, buf + idx, len - idx); // assumes idx <= len and len - idx <= STEP_SIZE -- TODO confirm callers only call this once has_full_block() is false
+  return len - idx; // number of real (non-padding) bytes copied into dst
+}
+
+template<size_t STEP_SIZE>
+really_inline void buf_block_reader<STEP_SIZE>::advance() {
+  idx += STEP_SIZE; // move to the next block
+}
 /* end file src/generic/stage1/buf_block_reader.h */
 /* begin file src/generic/stage1/json_string_scanner.h */
 namespace stage1 {
@@ -11082,13 +11712,15 @@ template<size_t STEP_SIZE>
 error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
   buf_block_reader<STEP_SIZE> reader(buf, len);
   json_minifier minifier(dst);
+
+  // Index the first n-1 blocks
   while (reader.has_full_block()) {
     minifier.step<STEP_SIZE>(reader.full_block(), reader);
   }
 
-  if (likely(reader.has_remainder())) {
-    uint8_t block[STEP_SIZE];
-    reader.get_remainder(block);
+  // Index the last (remainder) block, padded with spaces
+  uint8_t block[STEP_SIZE];
+  if (likely(reader.get_remainder(block)) > 0) {
     minifier.step<STEP_SIZE>(block, reader);
   }
 
@@ -11101,6 +11733,94 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
   return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
 }
 
+/* begin file src/generic/stage1/find_next_document_index.h */
+/**
+  * This algorithm is used to quickly identify the last structural position that
+  * makes up a complete document.
+  *
+  * It does this by going backwards and finding the last *document boundary* (a
+  * place where one value follows another without a comma between them). If the
+  * last document (the characters after the boundary) has an equal number of
+  * start and end brackets, it is considered complete.
+  *
+  * Simply put, we iterate over adjacent pairs of structural characters, starting
+  * from the end. We consider that we found the end of a JSON document when the
+  * first element of the pair is NOT one of these characters: '{' '[' ':' ','
+  * and when the second element is NOT one of these characters: '}' ']' ':' ','.
+  *
+  * This simple comparison works most of the time, but it does not cover cases
+  * where the batch's structural indexes contain a perfect amount of documents.
+  * In such a case, we do not have access to the structural index which follows
+  * the last document, therefore, we do not have access to the second element in
+  * the pair, and that means we cannot identify the last document. To fix this
+  * issue, we keep a count of the open and closed curly/square braces we found
+  * while searching for the pair. When we find a pair AND the count of open and
+  * closed curly/square braces is the same, we know that we just passed a
+  * complete document, therefore the last json buffer location is the end of the
+  * batch.
+  */
+really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) {
+  // TODO don't count separately, just figure out depth
+  auto arr_cnt = 0; // net '[' minus ']' seen so far while walking backwards
+  auto obj_cnt = 0; // net '{' minus '}' seen so far while walking backwards
+  for (auto i = parser.n_structural_indexes - 1; i > 0; i--) { // assumes n_structural_indexes >= 1 -- TODO confirm callers guarantee this
+    auto idxb = parser.structural_indexes[i]; // second element of the pair (i - 1, i)
+    switch (parser.buf[idxb]) {
+    case ':':
+    case ',':
+      continue; // a separator cannot start a document
+    case '}':
+      obj_cnt--;
+      continue; // a closing brace cannot start a document
+    case ']':
+      arr_cnt--;
+      continue; // a closing bracket cannot start a document
+    case '{':
+      obj_cnt++;
+      break; // may start a document: inspect the previous structural
+    case '[':
+      arr_cnt++;
+      break; // may start a document: inspect the previous structural
+    }
+    auto idxa = parser.structural_indexes[i - 1]; // first element of the pair (i - 1, i)
+    switch (parser.buf[idxa]) {
+    case '{':
+    case '[':
+    case ':':
+    case ',':
+      continue; // the value at i is still nested in, or attached to, what precedes it
+    }
+    // Last document is complete, so the next document will appear after!
+    if (!arr_cnt && !obj_cnt) {
+      return parser.n_structural_indexes;
+    }
+    // Last document is incomplete and begins at structural i. NOTE(review): this comment used to say "i + 1" while the code returns i -- confirm the intended indexing
+    return i;
+  }
+  return 0; // no document boundary found (structural 0 is never examined as the second element)
+}
+
+// Returns len, shortened so that a trailing multi-byte UTF-8 character that is cut off is skipped
+really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
+  if (unlikely(len < 3)) { // short inputs: only look at bytes that actually exist
+    switch (len) {
+      case 2:
+        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
+        if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
+        return len;
+      case 1:
+        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
+        return len;
+      case 0:
+        return len;
+    }
+  }
+  if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
+  if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
+  if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
+  return len;
+}
+/* end file src/generic/stage1/find_next_document_index.h */
 /* begin file src/generic/stage1/utf8_lookup2_algorithm.h */
 //
 // Detect Unicode errors.
@@ -11151,9 +11871,9 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
 //   support values with more than 23 bits (which a 4-byte character supports).
 //
 //   e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
-//   
+//
 // Legal utf-8 byte sequences per  http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
-// 
+//
 //   Code Points        1st       2s       3s       4s
 //  U+0000..U+007F     00..7F
 //  U+0080..U+07FF     C2..DF   80..BF
@@ -11168,6 +11888,7 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
 using namespace simd;
 
 namespace utf8_validation {
+  // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)".
 
   //
   // Find special case UTF-8 errors where the character is technically readable (has the right length)
@@ -11212,7 +11933,7 @@ namespace utf8_validation {
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
       // [0___]____ (ASCII)
-      0, 0, 0, 0,                          
+      0, 0, 0, 0,
       0, 0, 0, 0,
       // [10__]____ (continuation)
       0, 0, 0, 0,
@@ -11243,214 +11964,6 @@ namespace utf8_validation {
     return byte_1_high & byte_1_low & byte_2_high;
   }
 
-  //
-  // Validate the length of multibyte characters (that each multibyte character has the right number
-  // of continuation characters, and that all continuation characters are part of a multibyte
-  // character).
-  //
-  // Algorithm
-  // =========
-  //
-  // This algorithm compares *expected* continuation characters with *actual* continuation bytes,
-  // and emits an error anytime there is a mismatch.
-  //
-  // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte
-  // characters, the file will look like this:
-  //
-  // | Character             | 𝄞  |    |    |    | ₿  |    |    | ֏  |    | a  | b  |
-  // |-----------------------|----|----|----|----|----|----|----|----|----|----|----|
-  // | Character Length      |  4 |    |    |    |  3 |    |    |  2 |    |  1 |  1 |
-  // | Byte                  | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 |
-  // | is_second_byte        |    |  X |    |    |    |  X |    |    |  X |    |    |
-  // | is_third_byte         |    |    |  X |    |    |    |  X |    |    |    |    |
-  // | is_fourth_byte        |    |    |    |  X |    |    |    |    |    |    |    |
-  // | expected_continuation |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
-  // | is_continuation       |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
-  //
-  // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation):
-  //
-  // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not
-  //   part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just
-  //   floating around extra outside of any character, or that there is an illegal 5-byte character,
-  //   or maybe it's at the beginning of the file before any characters have started; but it's an
-  //   error in all these cases.
-  // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means
-  //   we started a new character before we were finished with the current one.
-  //
-  // Getting the Previous Bytes
-  // --------------------------
-  //
-  // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte
-  // character, we need to "shift the bytes" to find that out. This is what they mean:
-  //
-  // - `is_continuation`: if the current byte is a continuation.
-  // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character.
-  // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character.
-  // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character.
-  //
-  // We use shuffles to go n bytes back, selecting part of the current `input` and part of the
-  // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller
-  // function, because the 1-byte-back data is used by other checks as well.
-  //
-  // Getting the Continuation Mask
-  // -----------------------------
-  //
-  // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as
-  // numbers, using signed `<` and `>` operations to check if they are continuations or leads.
-  // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because
-  // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones).
-  //
-  // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads,"
-  // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them.
-  // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0.
-  //
-  // When treated as signed numbers, they look like this:
-  //
-  // | Type         | High Bits  | Binary Range | Signed |
-  // |--------------|------------|--------------|--------|
-  // | ASCII        | `0`        | `01111111`   |   127  |
-  // |              |            | `00000000`   |     0  |
-  // | 4+-Byte Lead | `1111`     | `11111111`   |    -1  |
-  // |              |            | `11110000    |   -16  |
-  // | 3-Byte Lead  | `1110`     | `11101111`   |   -17  |
-  // |              |            | `11100000    |   -32  |
-  // | 2-Byte Lead  | `110`      | `11011111`   |   -33  |
-  // |              |            | `11000000    |   -64  |
-  // | Continuation | `10`       | `10111111`   |   -65  |
-  // |              |            | `10000000    |  -128  |
-  //
-  // This makes it pretty easy to get the continuation mask! It's just a single comparison:
-  //
-  // ```
-  // is_continuation = input < -64`
-  // ```
-  //
-  // We can do something similar for the others, but it takes two comparisons instead of one: "is
-  // the start of a 4-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and
-  // `> -64`. Surely we can do better, they're right next to each other!
-  //
-  // Getting the is_xxx Masks: Shifting the Range
-  // --------------------------------------------
-  //
-  // Notice *why* continuations were a single comparison. The actual *range* would require two
-  // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get
-  // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be
-  // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`.
-  //
-  // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps
-  // ASCII down into the negative, and puts 4+-Byte Lead at the top:
-  //
-  // | Type                 | High Bits  | Binary Range | Signed |
-  // |----------------------|------------|--------------|-------|
-  // | 4+-Byte Lead (+ 127) | `0111`     | `01111111`   |   127 |
-  // |                      |            | `01110000    |   112 |
-  // |----------------------|------------|--------------|-------|
-  // | 3-Byte Lead (+ 127)  | `0110`     | `01101111`   |   111 |
-  // |                      |            | `01100000    |    96 |
-  // |----------------------|------------|--------------|-------|
-  // | 2-Byte Lead (+ 127)  | `010`      | `01011111`   |    95 |
-  // |                      |            | `01000000    |    64 |
-  // |----------------------|------------|--------------|-------|
-  // | Continuation (+ 127) | `00`       | `00111111`   |    63 |
-  // |                      |            | `00000000    |     0 |
-  // |----------------------|------------|--------------|-------|
-  // | ASCII (+ 127)        | `1`        | `11111111`   |    -1 |
-  // |                      |            | `10000000`   |  -128 |
-  // |----------------------|------------|--------------|-------|
-  // 
-  // *Now* we can use signed `>` on all of them:
-  //
-  // ```
-  // prev1 = input.prev<1>
-  // prev2 = input.prev<2>
-  // prev3 = input.prev<3>
-  // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128`
-  // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128`
-  // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128`
-  // is_second_byte = prev1_flipped > 63;  // 2+-byte lead
-  // is_third_byte  = prev2_flipped > 95;  // 3+-byte lead
-  // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead
-  // ```
-  //
-  // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number
-  // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3
-  // `^`'s at a time on Haswell, but only 2 `+`'s).
-  //
-  // That doesn't look like it saved us any instructions, did it? Well, because we're adding the
-  // same number to all of them, we can save one of those `+ 128` operations by assembling
-  // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128
-  // to it. One more instruction saved!
-  //
-  // ```
-  // prev1 = input.prev<1>
-  // prev3 = input.prev<3>
-  // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128`
-  // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128`
-  // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped): // <shuffle: take the first 2 bytes from prev1 and the rest from prev3  
-  // ```
-  //
-  // ### Bringing It All Together: Detecting the Errors
-  //
-  // At this point, we have `is_continuation`, `is_first_byte`, `is_second_byte` and `is_third_byte`.
-  // All we have left to do is check if they match!
-  //
-  // ```
-  // return (is_second_byte | is_third_byte | is_fourth_byte) ^ is_continuation;
-  // ```
-  //
-  // But wait--there's more. The above statement is only 3 operations, but they *cannot be done in
-  // parallel*. You have to do 2 `|`'s and then 1 `&`. Haswell, at least, has 3 ports that can do
-  // bitwise operations, and we're only using 1!
-  //
-  // Epilogue: Addition For Booleans
-  // -------------------------------
-  //
-  // There is one big case the above code doesn't explicitly talk about--what if is_second_byte
-  // and is_third_byte are BOTH true? That means there is a 3-byte and 2-byte character right next
-  // to each other (or any combination), and the continuation could be part of either of them!
-  // Our algorithm using `&` and `|` won't detect that the continuation byte is problematic.
-  //
-  // Never fear, though. If that situation occurs, we'll already have detected that the second
-  // leading byte was an error, because it was supposed to be a part of the preceding multibyte
-  // character, but it *wasn't a continuation*.
-  //
-  // We could stop here, but it turns out that we can fix it using `+` and `-` instead of `|` and
-  // `&`, which is both interesting and possibly useful (even though we're not using it here). It
-  // exploits the fact that in SIMD, a *true* value is -1, and a *false* value is 0. So those
-  // comparisons were giving us numbers!
-  //
-  // Given that, if you do `is_second_byte + is_third_byte + is_fourth_byte`, under normal
-  // circumstances you will either get 0 (0 + 0 + 0) or -1 (-1 + 0 + 0, etc.). Thus,
-  // `(is_second_byte + is_third_byte + is_fourth_byte) - is_continuation` will yield 0 only if
-  // *both* or *neither* are 0 (0-0 or -1 - -1). You'll get 1 or -1 if they are different. Because
-  // *any* nonzero value is treated as an error (not just -1), we're just fine here :)
-  //
-  // Further, if *more than one* multibyte character overlaps,
-  // `is_second_byte + is_third_byte + is_fourth_byte` will be -2 or -3! Subtracting `is_continuation`
-  // from *that* is guaranteed to give you a nonzero value (-1, -2 or -3). So it'll always be
-  // considered an error.
-  //
-  // One reason you might want to do this is parallelism. ^ and | are not associative, so
-  // (A | B | C) ^ D will always be three operations in a row: either you do A | B -> | C -> ^ D, or
-  // you do B | C -> | A -> ^ D. But addition and subtraction *are* associative: (A + B + C) - D can
-  // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and
-  // then adds the result together. Same number of operations, but if the processor can run
-  // independent things in parallel (which most can), it runs faster.
-  //
-  // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have
-  // a super nice advantage in that more of them can be run at the same time (they can run on 3
-  // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C,
-  // saving us the cycle we would have earned by using +. Even more, using an instruction with a
-  // wider array of ports can help *other* code run ahead, too, since these instructions can "get
-  // out of the way," running on a port other instructions can't.
-  // 
-  // Epilogue II: One More Trick
-  // ---------------------------
-  //
-  // There's one more relevant trick up our sleeve, it turns out: it turns out on Intel we can "pay
-  // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in
-  // check_special_cases()--but we'll talk about that there :)
-  //
   really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, simd8<uint8_t> prev1) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
@@ -11588,16 +12101,22 @@ class bit_indexer {
 
 class json_structural_indexer {
 public:
+  /**
+   * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
+   *
+   * @param partial Setting the partial parameter to true allows the find_structural_bits to
+   *   tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
+   *   you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
+   */
   template<size_t STEP_SIZE>
-  static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept;
+  static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept;
 
 private:
-  really_inline json_structural_indexer(uint32_t *structural_indexes)
-  : indexer{structural_indexes} {}
+  really_inline json_structural_indexer(uint32_t *structural_indexes);
   template<size_t STEP_SIZE>
   really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
   really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
-  really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming);
+  really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
 
   json_scanner scanner{};
   utf8_checker checker{};
@@ -11606,42 +12125,44 @@ class json_structural_indexer {
   uint64_t unescaped_chars_error = 0;
 };
 
-really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
-  uint64_t unescaped = in.lteq(0x1F);
-  checker.check_next_input(in);
-  indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
-  prev_structurals = block.structural_start();
-  unescaped_chars_error |= block.non_quote_inside_string(unescaped);
-}
+really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
 
-really_inline error_code json_structural_indexer::finish(parser &parser, size_t idx, size_t len, bool streaming) {
-  // Write out the final iteration's structurals
-  indexer.write(uint32_t(idx-64), prev_structurals);
+//
+// PERF NOTES:
+// We pipe 2 inputs through these stages:
+// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
+//    2 inputs' worth at once so that by the time step 2 is looking for that input, it's available.
+// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
+//    The output of step 1 depends entirely on this information. These functions don't quite use
+//    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
+//    at a time. The second input's scans has some dependency on the first ones finishing it, but
+//    they can make a lot of progress before they need that information.
+// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
+//    to finish: utf-8 checks and generating the output from the last iteration.
+//
+// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
+// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
+// workout.
+//
+template<size_t STEP_SIZE>
+error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept {
+  if (unlikely(len > parser.capacity())) { return CAPACITY; }
+  if (partial) { len = trim_partial_utf8(buf, len); }
 
-  error_code error = scanner.finish(streaming);
-  if (unlikely(error != SUCCESS)) { return error; }
+  buf_block_reader<STEP_SIZE> reader(buf, len);
+  json_structural_indexer indexer(parser.structural_indexes.get());
 
-  if (unescaped_chars_error) {
-    return UNESCAPED_CHARS;
+  // Read all but the last block
+  while (reader.has_full_block()) {
+    indexer.step<STEP_SIZE>(reader.full_block(), reader);
   }
 
-  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
-  /* a valid JSON file cannot have zero structural indexes - we should have
-   * found something */
-  if (unlikely(parser.n_structural_indexes == 0u)) {
-    return EMPTY;
-  }
-  if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
-    return UNEXPECTED_ERROR;
-  }
-  if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) {
-    /* the string might not be NULL terminated, but we add a virtual NULL
-     * ending character. */
-    parser.structural_indexes[parser.n_structural_indexes++] = uint32_t(len);
-  }
-  /* make it safe to dereference one beyond this array */
-  parser.structural_indexes[parser.n_structural_indexes] = 0;
-  return checker.errors();
+  // Take care of the last block (will always be there unless file is empty)
+  uint8_t block[STEP_SIZE];
+  if (unlikely(reader.get_remainder(block) == 0)) { return EMPTY; }
+  indexer.step<STEP_SIZE>(block, reader);
+
+  return indexer.finish(parser, reader.block_index(), len, partial);
 }
 
 template<>
@@ -11663,60 +12184,76 @@ really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_b
   reader.advance();
 }
 
-//
-// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
-//
-// PERF NOTES:
-// We pipe 2 inputs through these stages:
-// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
-//    2 inputs' worth at once so that by the time step 2 is looking for them input, it's available.
-// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
-//    The output of step 1 depends entirely on this information. These functions don't quite use
-//    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
-//    at a time. The second input's scans has some dependency on the first ones finishing it, but
-//    they can make a lot of progress before they need that information.
-// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
-//    to finish: utf-8 checks and generating the output from the last iteration.
-//
-// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
-// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
-// workout.
-//
-// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings.
-// The caller should still ensure that the input is valid UTF-8. If you are processing substrings,
-// you may want to call on a function like trimmed_length_safe_utf8.
-template<size_t STEP_SIZE>
-error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept {
-  if (unlikely(len > parser.capacity())) { return CAPACITY; }
+really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
+  uint64_t unescaped = in.lteq(0x1F);
+  checker.check_next_input(in);
+  indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
+  prev_structurals = block.structural_start();
+  unescaped_chars_error |= block.non_quote_inside_string(unescaped);
+}
+
+really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) {
+  // Write out the final iteration's structurals
+  indexer.write(uint32_t(idx-64), prev_structurals);
+
+  error_code error = scanner.finish(partial);
+  if (unlikely(error != SUCCESS)) { return error; }
 
-  buf_block_reader<STEP_SIZE> reader(buf, len);
-  json_structural_indexer indexer(parser.structural_indexes.get());
-  while (reader.has_full_block()) {
-    indexer.step<STEP_SIZE>(reader.full_block(), reader);
+  if (unescaped_chars_error) {
+    return UNESCAPED_CHARS;
   }
 
-  if (likely(reader.has_remainder())) {
-    uint8_t block[STEP_SIZE];
-    reader.get_remainder(block);
-    indexer.step<STEP_SIZE>(block, reader);
+  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
+  /***
+   * This is related to https://github.com/simdjson/simdjson/issues/906
+   * Basically, we want to make sure that if the parsing continues beyond the last (valid)
+   * structural character, it quickly stops.
+   * Only three structural characters can be repeated without triggering an error in JSON:  [,] and }.
+   * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
+   * continues, then it must be [,] or }.
+   * Suppose it is ] or }. We backtrack to the first character, what could it be that would
+   * not trigger an error? It could be ] or } but no, because you can't start a document that way.
+   * It can't be a comma, a colon or any simple value. So the only way we could continue is
+   * if the repeated character is [. But if so, the document must start with [. But if the document
+   * starts with [, it should end with ]. If we enforce that rule, then we would get
+   * ][[ which is invalid.
+   **/
+  parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
+  parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
+  parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
+  parser.next_structural_index = 0;
+  // a valid JSON file cannot have zero structural indexes - we should have found something
+  if (unlikely(parser.n_structural_indexes == 0u)) {
+    return EMPTY;
   }
-
-  return indexer.finish(parser, reader.block_index(), len, streaming);
+  if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
+    return UNEXPECTED_ERROR;
+  }
+  if (partial) {
+    auto new_structural_indexes = find_next_document_index(parser);
+    if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
+      return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
+    }
+    parser.n_structural_indexes = new_structural_indexes;
+  }
+  return checker.errors();
 }
 
 } // namespace stage1
 /* end file src/generic/stage1/json_structural_indexer.h */
-WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept {
-  return westmere::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming);
+WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
+  this->buf = _buf;
+  this->len = _len;
+  return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
 }
 
 } // namespace westmere
-
 } // namespace simdjson
 UNTARGET_REGION
-/* end file src/generic/stage1/json_structural_indexer.h */
-/* begin file src/westmere/stage2.cpp */
-/* westmere/implementation.h already included: #include "westmere/implementation.h" */
+
+//
+// Stage 2
+//
 /* begin file src/westmere/stringparsing.h */
 #ifndef SIMDJSON_WESTMERE_STRINGPARSING_H
 #define SIMDJSON_WESTMERE_STRINGPARSING_H
@@ -12130,10 +12667,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
   // If you consume a large value and you map it to "infinity", you will no
   // longer be able to serialize back a standard-compliant JSON. And there is
   // no realistic application where you might need values so large than they
-  // can't fit in binary64. The maximal value is about  1.7976931348623157 ×
+  // can't fit in binary64. The maximal value is about  1.7976931348623157 x
   // 10^308 It is an unimaginable large number. There will never be any piece of
   // engineering involving as many as 10^308 parts. It is estimated that there
-  // are about 10^80 atoms in the universe.  The estimate for the total number
+  // are about 10^80 atoms in the universe.  The estimate for the total number
   // of electrons is similar. Using a double-precision floating-point value, we
   // can represent easily the number of atoms in the universe. We could  also
   // represent the number of ways you can pick any three individual atoms at
@@ -12153,26 +12690,6 @@ really_inline bool is_integer(char c) {
   // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers
 }
 
-// We need to check that the character following a zero is valid. This is
-// probably frequent and it is harder than it looks. We are building all of this
-// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)...
-const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
-    1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-
-really_inline bool
-is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
-  return structural_or_whitespace_or_exponent_or_decimal_negated[c];
-}
 
 // check quickly whether the next 8 chars are made of digits
 // at a glance, it looks better than Mula's
@@ -12250,14 +12767,14 @@ never_inline bool parse_large_integer(const uint8_t *const src,
       // as a positive signed integer, but the negative version is
       // possible.
       constexpr int64_t signed_answer = INT64_MIN;
-      writer.write_s64(signed_answer);
+      writer.append_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
     } else {
       // we can negate safely
       int64_t signed_answer = -static_cast<int64_t>(i);
-      writer.write_s64(signed_answer);
+      writer.append_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
@@ -12270,12 +12787,12 @@ never_inline bool parse_large_integer(const uint8_t *const src,
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(i, src);
 #endif
-      writer.write_s64(i);
+      writer.append_s64(i);
     } else {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_unsigned_integer(i, src);
 #endif
-      writer.write_u64(i);
+      writer.append_u64(i);
     }
   }
   return is_structural_or_whitespace(*p);
@@ -12285,7 +12802,7 @@ template<typename W>
 bool slow_float_parsing(UNUSED const char * src, W writer) {
   double d;
   if (parse_float_strtod(src, &d)) {
-    writer.write_double(d);
+    writer.append_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_float(d, (const uint8_t *)src);
 #endif
@@ -12309,10 +12826,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) {
 template<typename W>
 really_inline bool parse_number(UNUSED const uint8_t *const src,
                                 UNUSED bool found_minus,
-                                W writer) {
+                                W &writer) {
 #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
                                   // useful to skip parsing
-  writer.write_s64(0);        // always write zero
+  writer.append_s64(0);        // always write zero
   return true;                    // always succeeds
 #else
   const char *p = reinterpret_cast<const char *>(src);
@@ -12332,7 +12849,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
   uint64_t i;      // an unsigned int avoids signed overflows (which are bad)
   if (*p == '0') { // 0 cannot be followed by an integer
     ++p;
-    if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
+    if (is_integer(*p)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_invalid_number(src);
 #endif
@@ -12456,7 +12973,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       }
       // we over-decrement by one when there is a '.'
       digit_count -= int(start - start_digits);
-      if (digit_count >= 19) {
+      if (unlikely(digit_count >= 19)) {
         // Ok, chances are good that we had an overflow!
         // this is almost never going to get called!!!
         // we start anew, going slowly!!!
@@ -12464,14 +12981,22 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
         // 10000000000000000000000000000000000000000000e+308
         // 3.1415926535897932384626433832795028841971693993751
         //
-        return slow_float_parsing((const char *) src, writer);
+        bool success = slow_float_parsing((const char *) src, writer);
+        // The number was already written, but slow_float_parsing() received a copy of the
+        // writer, so we have to skip those tape spots now that we've returned.
+        writer.skip_double();
+        return success;
       }
     }
     if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) ||
         (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!!
       // this is almost never going to get called!!!
       // we start anew, going slowly!!!
-      return slow_float_parsing((const char *) src, writer);
+      bool success = slow_float_parsing((const char *) src, writer);
+      // The number was already written, but we made a copy of the writer when we passed it to the
+      // slow_float_parsing() function, so we have to skip those tape spots now that we've returned
+      writer.skip_double();
+      return success;
     }
     bool success = true;
     double d = compute_float_64(exponent, i, negative, &success);
@@ -12480,7 +13005,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       success = parse_float_strtod((const char *)src, &d);
     }
     if (success) {
-      writer.write_double(d);
+      writer.append_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_float(d, src);
 #endif
@@ -12495,10 +13020,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
     if (unlikely(digit_count >= 18)) { // this is uncommon!!!
       // there is a good chance that we had an overflow, so we need
       // need to recover: we parse the whole thing again.
-      return parse_large_integer(src, writer, found_minus);
+      bool success = parse_large_integer(src, writer, found_minus);
+      // The number was already written, but we made a copy of the writer when we passed it to
+      // parse_large_integer(), so we have to skip those tape spots now that we've returned.
+      writer.skip_large_integer();
+      return success;
     }
     i = negative ? 0 - i : i;
-    writer.write_s64(i);
+    writer.append_s64(i);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_integer(i, src);
 #endif
@@ -12523,6 +13052,72 @@ TARGET_WESTMERE
 namespace simdjson {
 namespace westmere {
 
+/* begin file src/generic/stage2/logger.h */
+// This is for an internal-only stage 2 specific logger.
+// Set LOG_ENABLED = true to log what stage 2 is doing!
+namespace logger {
+  static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
+
+  static constexpr const bool LOG_ENABLED = false;
+  static constexpr const int LOG_EVENT_LEN = 30;
+  static constexpr const int LOG_BUFFER_LEN = 20;
+  static constexpr const int LOG_DETAIL_LEN = 50;
+  static constexpr const int LOG_INDEX_LEN = 10;
+
+  static int log_depth; // Not threadsafe. Log only.
+
+  // Helper to turn unprintable or newline characters into spaces
+  static really_inline char printable_char(char c) {
+    if (c >= 0x20) {
+      return c;
+    } else {
+      return ' ';
+    }
+  }
+
+  // Print the table header and reset log_depth
+  static really_inline void log_start() {
+    if (LOG_ENABLED) {
+      log_depth = 0;
+      printf("\n");
+      printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index");
+      printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES);
+    }
+  }
+
+  static really_inline void log_string(const char *message) {
+    if (LOG_ENABLED) {
+      printf("%s\n", message);
+    }
+  }
+
+  // Logs a single row of the trace table (event, buffer preview, current/next chars, positions, detail)
+  template<typename S>
+  static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
+    if (LOG_ENABLED) {
+      printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
+      {
+        // Print the next N characters in the buffer.
+        printf("| ");
+        // Start at the buffer position of the current structural character.
+        // Print spaces for unprintable or newline characters.
+        for (int i=0;i<LOG_BUFFER_LEN;i++) {
+          printf("%c", printable_char(structurals.current()[i]));
+        }
+        printf(" ");
+      }
+      printf("|    %c ", printable_char(structurals.current_char()));
+      printf("|    %c ", printable_char(structurals.peek_next_char()));
+      printf("| %5u ", structurals.parser.structural_indexes[*(structurals.current_structural+1)]);
+      printf("| %5u ", structurals.next_tape_index());
+      printf("| %-*s ", LOG_DETAIL_LEN, detail);
+      printf("| %*u ", LOG_INDEX_LEN, *structurals.current_structural);
+      printf("|\n");
+    }
+  }
+} // namespace logger
+
+/* end file src/generic/stage2/logger.h */
 /* begin file src/generic/stage2/atomparsing.h */
 namespace stage2 {
 namespace atomparsing {
@@ -12581,26 +13176,34 @@ namespace stage2 {
 
 class structural_iterator {
 public:
-  really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index)
-    : buf{_buf},
-     len{_len},
-     structural_indexes{_structural_indexes},
-     next_structural{next_structural_index}
-    {}
-  really_inline char advance_char() {
-    idx = structural_indexes[next_structural];
-    next_structural++;
-    c = *current();
-    return c;
+  const uint8_t* const buf;
+  uint32_t *current_structural;
+  dom_parser_implementation &parser;
+
+  // Start a structural iterator at the given index into the parser's structural_indexes
+  really_inline structural_iterator(dom_parser_implementation &_parser, size_t start_structural_index)
+    : buf{_parser.buf},
+      current_structural{&_parser.structural_indexes[start_structural_index]},
+      parser{_parser} {
+  }
+  // Get the buffer position of the current structural character
+  really_inline const uint8_t* current() {
+    return &buf[*current_structural];
   }
+  // Get the current structural character
   really_inline char current_char() {
-    return c;
+    return buf[*current_structural];
   }
-  really_inline const uint8_t* current() {
-    return &buf[idx];
+  // Get the next structural character without advancing
+  really_inline char peek_next_char() {
+    return buf[*(current_structural+1)];
+  }
+  really_inline char advance_char() {
+    current_structural++;
+    return buf[*current_structural];
   }
   really_inline size_t remaining_len() {
-    return len - idx;
+    return parser.len - *current_structural;
   }
   template<typename F>
   really_inline bool with_space_terminated_copy(const F& f) {
@@ -12617,32 +13220,25 @@ class structural_iterator {
     * practice unless you are in the strange scenario where you have many JSON
     * documents made of single atoms.
     */
-    char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
+    char *copy = static_cast<char *>(malloc(parser.len + SIMDJSON_PADDING));
     if (copy == nullptr) {
       return true;
     }
-    memcpy(copy, buf, len);
-    memset(copy + len, ' ', SIMDJSON_PADDING);
-    bool result = f(reinterpret_cast<const uint8_t*>(copy), idx);
+    memcpy(copy, buf, parser.len);
+    memset(copy + parser.len, ' ', SIMDJSON_PADDING);
+    bool result = f(reinterpret_cast<const uint8_t*>(copy), *current_structural);
     free(copy);
     return result;
   }
   really_inline bool past_end(uint32_t n_structural_indexes) {
-    return next_structural+1 > n_structural_indexes;
+    return current_structural >= &parser.structural_indexes[n_structural_indexes];
   }
   really_inline bool at_end(uint32_t n_structural_indexes) {
-    return next_structural+1 == n_structural_indexes;
+    return current_structural == &parser.structural_indexes[n_structural_indexes];
   }
-  really_inline size_t next_structural_index() {
-    return next_structural;
+  really_inline bool at_beginning() {
+    return current_structural == parser.structural_indexes.get();
   }
-
-  const uint8_t* const buf;
-  const size_t len;
-  const uint32_t* const structural_indexes;
-  size_t next_structural; // next structural index
-  size_t idx{0}; // location of the structural character in the input (buf)
-  uint8_t c{0};  // used to track the (structural) character we are looking at
 };
 
 } // namespace stage2
@@ -12654,8 +13250,105 @@ class structural_iterator {
 // "simdjson/stage2.h" (this simplifies amalgation)
 
 namespace stage2 {
+namespace { // Make everything here private
+
+/* begin file src/generic/stage2/tape_writer.h */
+struct tape_writer {
+  /** The next place to write to tape */
+  uint64_t *next_tape_loc;
+  
+  /** Write a signed 64-bit value to tape. */
+  really_inline void append_s64(int64_t value) noexcept;
+
+  /** Write an unsigned 64-bit value to tape. */
+  really_inline void append_u64(uint64_t value) noexcept;
+
+  /** Write a double value to tape. */
+  really_inline void append_double(double value) noexcept;
+
+  /**
+   * Append a tape entry (an 8-bit type, and 56 bits worth of value).
+   */
+  really_inline void append(uint64_t val, internal::tape_type t) noexcept;
+
+  /**
+   * Skip the current tape entry without writing.
+   *
+   * Used to skip the start of the container, since we'll come back later to fill it in when the
+   * container ends.
+   */
+  really_inline void skip() noexcept;
+
+  /**
+   * Skip the number of tape entries necessary to write a large u64 or i64.
+   */
+  really_inline void skip_large_integer() noexcept;
+
+  /**
+   * Skip the number of tape entries necessary to write a double.
+   */
+  really_inline void skip_double() noexcept;
+
+  /**
+   * Write a value to a known location on tape.
+   *
+   * Used to go back and write out the start of a container after the container ends.
+   */
+  really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
+
+private:
+  /**
+   * Append both the tape entry, and a supplementary value following it. Used for types that need
+   * all 64 bits, such as double and uint64_t.
+   */
+  template<typename T>
+  really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
+}; // struct tape_writer
+
+really_inline void tape_writer::append_s64(int64_t value) noexcept {
+  append2(0, value, internal::tape_type::INT64);
+}
+
+really_inline void tape_writer::append_u64(uint64_t value) noexcept {
+  append(0, internal::tape_type::UINT64);
+  *next_tape_loc = value;
+  next_tape_loc++;
+}
 
-using internal::ret_address;
+/** Write a double value to tape. */
+really_inline void tape_writer::append_double(double value) noexcept {
+  append2(0, value, internal::tape_type::DOUBLE);
+}
+
+really_inline void tape_writer::skip() noexcept {
+  next_tape_loc++;
+}
+
+really_inline void tape_writer::skip_large_integer() noexcept {
+  next_tape_loc += 2;
+}
+
+really_inline void tape_writer::skip_double() noexcept {
+  next_tape_loc += 2;
+}
+
+really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
+  *next_tape_loc = val | ((uint64_t(char(t))) << 56);
+  next_tape_loc++;
+}
+
+template<typename T>
+really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
+  append(val, t);
+  static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
+  memcpy(next_tape_loc, &val2, sizeof(val2));
+  next_tape_loc++;
+}
+
+really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
+  tape_loc = val | ((uint64_t(char(t))) << 56);
+}
+/* end file src/generic/stage2/tape_writer.h */
 
 #ifdef SIMDJSON_USE_COMPUTED_GOTO
 #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue }
@@ -12686,102 +13379,88 @@ using internal::ret_address;
 #endif // SIMDJSON_USE_COMPUTED_GOTO
 
 struct unified_machine_addresses {
-  ret_address array_begin;
-  ret_address array_continue;
-  ret_address error;
-  ret_address finish;
-  ret_address object_begin;
-  ret_address object_continue;
+  ret_address_t array_begin;
+  ret_address_t array_continue;
+  ret_address_t error;
+  ret_address_t finish;
+  ret_address_t object_begin;
+  ret_address_t object_continue;
 };
 
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } }
 
-struct number_writer {
-  parser &doc_parser;
-  
-  really_inline void write_s64(int64_t value) noexcept {
-    write_tape(0, internal::tape_type::INT64);
-    std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value));
-    ++doc_parser.current_loc;
-  }
-  really_inline void write_u64(uint64_t value) noexcept {
-    write_tape(0, internal::tape_type::UINT64);
-    doc_parser.doc.tape[doc_parser.current_loc++] = value;
-  }
-  really_inline void write_double(double value) noexcept {
-    write_tape(0, internal::tape_type::DOUBLE);
-    static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size");
-    memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double));
-    // doc.tape[doc.current_loc++] = *((uint64_t *)&d);
-  }
-  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
-    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
-  }
-}; // struct number_writer
-
-struct structural_parser {
-  structural_iterator structurals;
-  parser &doc_parser;
+struct structural_parser : structural_iterator {
+  /** Lets you append to the tape */
+  tape_writer tape;
   /** Next write location in the string buf for stage 2 parsing */
-  uint8_t *current_string_buf_loc{};
-  uint32_t depth;
-
-  really_inline structural_parser(
-    const uint8_t *buf,
-    size_t len,
-    parser &_doc_parser,
-    uint32_t next_structural = 0
-  ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {}
-
-  WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) {
-    doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc;
-    doc_parser.containing_scope[depth].count = 0;
-    write_tape(0, type); // if the document is correct, this gets rewritten later
-    doc_parser.ret_address[depth] = continue_state;
+  uint8_t *current_string_buf_loc;
+  /** Current depth (nested objects and arrays) */
+  uint32_t depth{0};
+
+  // Non-streaming callers pass an explicit 0 as start_structural_index, which enables optimizations
+  really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index)
+    : structural_iterator(_parser, start_structural_index),
+      tape{parser.doc->tape.get()},
+      current_string_buf_loc{parser.doc->string_buf.get()} {
+  }
+
+  WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) {
+    parser.containing_scope[depth].tape_index = next_tape_index();
+    parser.containing_scope[depth].count = 0;
+    tape.skip(); // We don't actually *write* the start element until the end.
+    parser.ret_address[depth] = continue_state;
     depth++;
-    return depth >= doc_parser.max_depth();
+    bool exceeded_max_depth = depth >= parser.max_depth();
+    if (exceeded_max_depth) { log_error("Exceeded max depth!"); }
+    return exceeded_max_depth;
   }
 
-  WARN_UNUSED really_inline bool start_document(ret_address continue_state) {
-    return start_scope(internal::tape_type::ROOT, continue_state);
+  WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) {
+    log_start_value("document");
+    return start_scope(continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_object(ret_address continue_state) {
-    return start_scope(internal::tape_type::START_OBJECT, continue_state);
+  WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) {
+    log_start_value("object");
+    return start_scope(continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_array(ret_address continue_state) {
-    return start_scope(internal::tape_type::START_ARRAY, continue_state);
+  WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) {
+    log_start_value("array");
+    return start_scope(continue_state);
   }
 
   // this function is responsible for annotating the start of the scope
-  really_inline void end_scope(internal::tape_type type) noexcept {
+  really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept {
     depth--;
-    // write our doc.tape location to the header scope
+    // write our doc->tape location to the header scope
     // The root scope gets written *at* the previous location.
-    write_tape(doc_parser.containing_scope[depth].tape_index, type);
+    tape.append(parser.containing_scope[depth].tape_index, end);
     // count can overflow if it exceeds 24 bits... so we saturate
     // the convention being that a cnt of 0xffffff or more is undetermined in value (>=  0xffffff).
-    const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index;
-    const uint32_t count = doc_parser.containing_scope[depth].count;
+    const uint32_t start_tape_index = parser.containing_scope[depth].tape_index;
+    const uint32_t count = parser.containing_scope[depth].count;
     const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
-    // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index]
-    doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32);
+    // Write the start element in a single store, now that the next tape index and count are known
+    tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start);
+  }
+
+  really_inline uint32_t next_tape_index() {
+    return uint32_t(tape.next_tape_loc - parser.doc->tape.get());
   }
 
   really_inline void end_object() {
-    end_scope(internal::tape_type::END_OBJECT);
+    log_end_value("object");
+    end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
   }
   really_inline void end_array() {
-    end_scope(internal::tape_type::END_ARRAY);
+    log_end_value("array");
+    end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
   }
   really_inline void end_document() {
-    end_scope(internal::tape_type::ROOT);
-  }
-
-  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
-    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
+    log_end_value("document");
+    end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT);
   }
 
   // increment_count increments the count of keys in an object or values in an array.
@@ -12789,17 +13468,16 @@ struct structural_parser {
   // must be increment in the preceding depth (depth-1) where the array or
   // the object resides.
   really_inline void increment_count() {
-    doc_parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
+    parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
   }
 
   really_inline uint8_t *on_start_string() noexcept {
-    /* we advance the point, accounting for the fact that we have a NULL
-      * termination         */
-    write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING);
+    // we advance the point, accounting for the fact that we have a NULL termination
+    tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING);
     return current_string_buf_loc + sizeof(uint32_t);
   }
 
-  really_inline bool on_end_string(uint8_t *dst) noexcept {
+  really_inline void on_end_string(uint8_t *dst) noexcept {
     uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
     // TODO check for overflow in case someone has a crazy string (>=4GB?)
     // But only add the overflow check when the document itself exceeds 4GB
@@ -12809,73 +13487,49 @@ struct structural_parser {
     // be NULL terminated? It comes at a small cost
     *dst = 0;
     current_string_buf_loc = dst + 1;
-    return true;
   }
 
-  WARN_UNUSED really_inline bool parse_string() {
+  WARN_UNUSED really_inline bool parse_string(bool key = false) {
+    log_value(key ? "key" : "string");
     uint8_t *dst = on_start_string();
-    dst = stringparsing::parse_string(structurals.current(), dst);
+    dst = stringparsing::parse_string(current(), dst);
     if (dst == nullptr) {
+      log_error("Invalid escape in string");
       return true;
     }
-    return !on_end_string(dst);
+    on_end_string(dst);
+    return false;
   }
 
   WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) {
-    number_writer writer{doc_parser};
-    return !numberparsing::parse_number(src, found_minus, writer);
+    log_value("number");
+    bool succeeded = numberparsing::parse_number(src, found_minus, tape);
+    if (!succeeded) { log_error("Invalid number"); }
+    return !succeeded;
   }
   WARN_UNUSED really_inline bool parse_number(bool found_minus) {
-    return parse_number(structurals.current(), found_minus);
-  }
-
-  WARN_UNUSED really_inline bool parse_atom() {
-    switch (structurals.current_char()) {
-      case 't':
-        if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; }
-        write_tape(0, internal::tape_type::TRUE_VALUE);
-        break;
-      case 'f':
-        if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; }
-        write_tape(0, internal::tape_type::FALSE_VALUE);
-        break;
-      case 'n':
-        if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; }
-        write_tape(0, internal::tape_type::NULL_VALUE);
-        break;
-      default:
-        return true;
-    }
-    return false;
-  }
-
-  WARN_UNUSED really_inline bool parse_single_atom() {
-    switch (structurals.current_char()) {
-      case 't':
-        if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; }
-        write_tape(0, internal::tape_type::TRUE_VALUE);
-        break;
-      case 'f':
-        if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; }
-        write_tape(0, internal::tape_type::FALSE_VALUE);
-        break;
-      case 'n':
-        if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; }
-        write_tape(0, internal::tape_type::NULL_VALUE);
-        break;
-      default:
-        return true;
-    }
-    return false;
+    return parse_number(current(), found_minus);
   }
 
-  WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) {
-    switch (structurals.current_char()) {
+  WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) {
+    switch (advance_char()) {
     case '"':
       FAIL_IF( parse_string() );
       return continue_state;
-    case 't': case 'f': case 'n':
-      FAIL_IF( parse_atom() );
+    case 't':
+      log_value("true");
+      FAIL_IF( !atomparsing::is_valid_true_atom(current()) );
+      tape.append(0, internal::tape_type::TRUE_VALUE);
+      return continue_state;
+    case 'f':
+      log_value("false");
+      FAIL_IF( !atomparsing::is_valid_false_atom(current()) );
+      tape.append(0, internal::tape_type::FALSE_VALUE);
+      return continue_state;
+    case 'n':
+      log_value("null");
+      FAIL_IF( !atomparsing::is_valid_null_atom(current()) );
+      tape.append(0, internal::tape_type::NULL_VALUE);
       return continue_state;
     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
@@ -12891,40 +13545,27 @@ struct structural_parser {
       FAIL_IF( start_array(continue_state) );
       return addresses.array_begin;
     default:
+      log_error("Non-value found when value was expected!");
       return addresses.error;
     }
   }
 
   WARN_UNUSED really_inline error_code finish() {
-    // the string might not be NULL terminated.
-    if ( !structurals.at_end(doc_parser.n_structural_indexes) ) {
-      return on_error(TAPE_ERROR);
-    }
     end_document();
+    parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]);
+
     if (depth != 0) {
-      return on_error(TAPE_ERROR);
-    }
-    if (doc_parser.containing_scope[depth].tape_index != 0) {
-      return on_error(TAPE_ERROR);
+      log_error("Unclosed objects or arrays!");
+      return parser.error = TAPE_ERROR;
     }
 
-    return on_success(SUCCESS);
-  }
-
-  really_inline error_code on_error(error_code new_error_code) noexcept {
-    doc_parser.error = new_error_code;
-    return new_error_code;
-  }
-  really_inline error_code on_success(error_code success_code) noexcept {
-    doc_parser.error = success_code;
-    doc_parser.valid = true;
-    return success_code;
+    return SUCCESS;
   }
 
   WARN_UNUSED really_inline error_code error() {
-    /* We do not need the next line because this is done by doc_parser.init_stage2(),
+    /* We do not need the next line because this is done by parser.init_stage2(),
     * pessimistically.
-    * doc_parser.is_valid  = false;
+    * parser.is_valid  = false;
     * At this point in the code, we have all the time in the world.
     * Note that we know exactly where we are in the document so we could,
     * without any overhead on the processing code, report a specific
@@ -12932,12 +13573,12 @@ struct structural_parser {
     * We could even trigger special code paths to assess what happened
     * carefully,
     * all without any added cost. */
-    if (depth >= doc_parser.max_depth()) {
-      return on_error(DEPTH_ERROR);
+    if (depth >= parser.max_depth()) {
+      return parser.error = DEPTH_ERROR;
     }
-    switch (structurals.current_char()) {
+    switch (current_char()) {
     case '"':
-      return on_error(STRING_ERROR);
+      return parser.error = STRING_ERROR;
     case '0':
     case '1':
     case '2':
@@ -12949,92 +13590,124 @@ struct structural_parser {
     case '8':
     case '9':
     case '-':
-      return on_error(NUMBER_ERROR);
+      return parser.error = NUMBER_ERROR;
     case 't':
-      return on_error(T_ATOM_ERROR);
+      return parser.error = T_ATOM_ERROR;
     case 'n':
-      return on_error(N_ATOM_ERROR);
+      return parser.error = N_ATOM_ERROR;
     case 'f':
-      return on_error(F_ATOM_ERROR);
+      return parser.error = F_ATOM_ERROR;
     default:
-      return on_error(TAPE_ERROR);
+      return parser.error = TAPE_ERROR;
     }
   }
 
   really_inline void init() {
-    current_string_buf_loc = doc_parser.doc.string_buf.get();
-    doc_parser.current_loc = 0;
-    doc_parser.valid = false;
-    doc_parser.error = UNINITIALIZED;
+    log_start();
+    parser.error = UNINITIALIZED;
   }
 
-  WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) {
-    init(); // sets is_valid to false
-    if (len > doc_parser.capacity()) {
-      return CAPACITY;
+  WARN_UNUSED really_inline error_code start(ret_address_t finish_state) {
+    // If there are no structurals left, return EMPTY
+    if (at_end(parser.n_structural_indexes)) {
+      return parser.error = EMPTY;
     }
-    // Advance to the first character as soon as possible
-    structurals.advance_char();
+
+    init();
     // Push the root scope (there is always at least one scope)
     if (start_document(finish_state)) {
-      return on_error(DEPTH_ERROR);
+      return parser.error = DEPTH_ERROR;
     }
     return SUCCESS;
   }
 
-  really_inline char advance_char() {
-    return structurals.advance_char();
+  really_inline void log_value(const char *type) {
+    logger::log_line(*this, "", type, "");
   }
-};
+
+  static really_inline void log_start() {
+    logger::log_start();
+  }
+
+  really_inline void log_start_value(const char *type) {
+    logger::log_line(*this, "+", type, "");
+    if (logger::LOG_ENABLED) { logger::log_depth++; }
+  }
+
+  really_inline void log_end_value(const char *type) {
+    if (logger::LOG_ENABLED) { logger::log_depth--; }
+    logger::log_line(*this, "-", type, "");
+  }
+
+  really_inline void log_error(const char *error) {
+    logger::log_line(*this, "", "ERROR", error);
+  }
+}; // struct structural_parser
 
 // Redefine FAIL_IF to use goto since it'll be used inside the function now
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { goto error; } }
 
-} // namespace stage2
-
-/************
- * The JSON is parsed to a tape, see the accompanying tape.md file
- * for documentation.
- ***********/
-WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
+template<bool STREAMING>
+WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
+  dom_parser.doc = &doc;
   static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
-  stage2::structural_parser parser(buf, len, doc_parser);
-  error_code result = parser.start(len, addresses.finish);
+  stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
+  error_code result = parser.start(addresses.finish);
   if (result) { return result; }
 
   //
   // Read first value
   //
-  switch (parser.structurals.current_char()) {
+  switch (parser.current_char()) {
   case '{':
     FAIL_IF( parser.start_object(addresses.finish) );
     goto object_begin;
   case '[':
     FAIL_IF( parser.start_array(addresses.finish) );
+    // Make sure the outer array is closed before continuing; otherwise, there are ways we could get
+    // into memory corruption. See https://github.com/simdjson/simdjson/issues/906
+    if (!STREAMING) {
+      if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') {
+        goto error;
+      }
+    }
     goto array_begin;
   case '"':
     FAIL_IF( parser.parse_string() );
     goto finish;
-  case 't': case 'f': case 'n':
-    FAIL_IF( parser.parse_single_atom() );
+  case 't':
+    parser.log_value("true");
+    FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) );
+    parser.tape.append(0, internal::tape_type::TRUE_VALUE);
+    goto finish;
+  case 'f':
+    parser.log_value("false");
+    FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) );
+    parser.tape.append(0, internal::tape_type::FALSE_VALUE);
+    goto finish;
+  case 'n':
+    parser.log_value("null");
+    FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) );
+    parser.tape.append(0, internal::tape_type::NULL_VALUE);
     goto finish;
   case '0': case '1': case '2': case '3': case '4':
   case '5': case '6': case '7': case '8': case '9':
     FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], false);
       })
     );
     goto finish;
   case '-':
     FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], true);
       })
     );
     goto finish;
   default:
+    parser.log_error("Document starts with a non-value character");
     goto error;
   }
 
@@ -13045,43 +13718,45 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa
   switch (parser.advance_char()) {
   case '"': {
     parser.increment_count();
-    FAIL_IF( parser.parse_string() );
+    FAIL_IF( parser.parse_string(true) );
     goto object_key_state;
   }
   case '}':
     parser.end_object();
     goto scope_end;
   default:
+    parser.log_error("Object does not start with a key");
     goto error;
   }
 
 object_key_state:
-  FAIL_IF( parser.advance_char() != ':' );
-  parser.advance_char();
+  if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; }
   GOTO( parser.parse_value(addresses, addresses.object_continue) );
 
 object_continue:
   switch (parser.advance_char()) {
   case ',':
     parser.increment_count();
-    FAIL_IF( parser.advance_char() != '"' );
-    FAIL_IF( parser.parse_string() );
+    if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; }
+    FAIL_IF( parser.parse_string(true) );
     goto object_key_state;
   case '}':
     parser.end_object();
     goto scope_end;
   default:
+    parser.log_error("No comma between object fields");
     goto error;
   }
 
 scope_end:
-  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
+  CONTINUE( parser.parser.ret_address[parser.depth] );
 
 //
 // Array parser states
 //
 array_begin:
-  if (parser.advance_char() == ']') {
+  if (parser.peek_next_char() == ']') {
+    parser.advance_char();
     parser.end_array();
     goto scope_end;
   }
@@ -13096,12 +13771,12 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa
   switch (parser.advance_char()) {
   case ',':
     parser.increment_count();
-    parser.advance_char();
     goto main_array_switch;
   case ']':
     parser.end_array();
     goto scope_end;
   default:
+    parser.log_error("Missing comma between array values");
     goto error;
   }
 
@@ -13112,176 +13787,46 @@ WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, pa
   return parser.error();
 }
 
-WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
-  error_code code = stage1(buf, len, doc_parser, false);
-  if (!code) {
-    code = stage2(buf, len, doc_parser);
-  }
-  return code;
-}
-/* end file src/generic/stage2/structural_parser.h */
-/* begin file src/generic/stage2/streaming_structural_parser.h */
-namespace stage2 {
-
-struct streaming_structural_parser: structural_parser {
-  really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {}
-
-  // override to add streaming
-  WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) {
-    init(); // sets is_valid to false
-    // Capacity ain't no thang for streaming, so we don't check it.
-    // Advance to the first character as soon as possible
-    advance_char();
-    // Push the root scope (there is always at least one scope)
-    if (start_document(finish_parser)) {
-      return on_error(DEPTH_ERROR);
-    }
-    return SUCCESS;
-  }
-
-  // override to add streaming
-  WARN_UNUSED really_inline error_code finish() {
-    if ( structurals.past_end(doc_parser.n_structural_indexes) ) {
-      return on_error(TAPE_ERROR);
-    }
-    end_document();
-    if (depth != 0) {
-      return on_error(TAPE_ERROR);
-    }
-    if (doc_parser.containing_scope[depth].tape_index != 0) {
-      return on_error(TAPE_ERROR);
-    }
-    bool finished = structurals.at_end(doc_parser.n_structural_indexes);
-    return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
-  }
-};
-
+} // namespace {}
 } // namespace stage2
 
 /************
  * The JSON is parsed to a tape, see the accompanying tape.md file
  * for documentation.
  ***********/
-WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept {
-  static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
-  stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json));
-  error_code result = parser.start(len, addresses.finish);
+WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
+  error_code result = stage2::parse_structurals<false>(*this, _doc);
   if (result) { return result; }
-  //
-  // Read first value
-  //
-  switch (parser.structurals.current_char()) {
-  case '{':
-    FAIL_IF( parser.start_object(addresses.finish) );
-    goto object_begin;
-  case '[':
-    FAIL_IF( parser.start_array(addresses.finish) );
-    goto array_begin;
-  case '"':
-    FAIL_IF( parser.parse_string() );
-    goto finish;
-  case 't': case 'f': case 'n':
-    FAIL_IF( parser.parse_single_atom() );
-    goto finish;
-  case '0': case '1': case '2': case '3': case '4':
-  case '5': case '6': case '7': case '8': case '9':
-    FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
-        return parser.parse_number(&copy[idx], false);
-      })
-    );
-    goto finish;
-  case '-':
-    FAIL_IF(
-      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
-        return parser.parse_number(&copy[idx], true);
-      })
-    );
-    goto finish;
-  default:
-    goto error;
-  }
 
-//
-// Object parser parsers
-//
-object_begin:
-  switch (parser.advance_char()) {
-  case '"': {
-    FAIL_IF( parser.parse_string() );
-    goto object_key_parser;
-  }
-  case '}':
-    parser.end_object();
-    goto scope_end;
-  default:
-    goto error;
-  }
-
-object_key_parser:
-  FAIL_IF( parser.advance_char() != ':' );
-  parser.increment_count();
-  parser.advance_char();
-  GOTO( parser.parse_value(addresses, addresses.object_continue) );
-
-object_continue:
-  switch (parser.advance_char()) {
-  case ',':
-    FAIL_IF( parser.advance_char() != '"' );
-    FAIL_IF( parser.parse_string() );
-    goto object_key_parser;
-  case '}':
-    parser.end_object();
-    goto scope_end;
-  default:
-    goto error;
-  }
-
-scope_end:
-  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
-
-//
-// Array parser parsers
-//
-array_begin:
-  if (parser.advance_char() == ']') {
-    parser.end_array();
-    goto scope_end;
+  // If we didn't make it to the end, it's an error
+  if ( next_structural_index != n_structural_indexes ) {
+    logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
+    return error = TAPE_ERROR;
   }
-  parser.increment_count();
-
-main_array_switch:
-  /* we call update char on all paths in, so we can peek at parser.c on the
-   * on paths that can accept a close square brace (post-, and at start) */
-  GOTO( parser.parse_value(addresses, addresses.array_continue) );
 
-array_continue:
-  switch (parser.advance_char()) {
-  case ',':
-    parser.increment_count();
-    parser.advance_char();
-    goto main_array_switch;
-  case ']':
-    parser.end_array();
-    goto scope_end;
-  default:
-    goto error;
-  }
+  return SUCCESS;
+}
 
-finish:
-  next_json = parser.structurals.next_structural_index();
-  return parser.finish();
+/************
+ * The JSON is parsed to a tape, see the accompanying tape.md file
+ * for documentation.
+ ***********/
+WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
+  return stage2::parse_structurals<true>(*this, _doc);
+}
+/* end file src/generic/stage2/tape_writer.h */
 
-error:
-  return parser.error();
+WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
+  error_code err = stage1(_buf, _len, false);
+  if (err) { return err; }
+  return stage2(_doc);
 }
-/* end file src/generic/stage2/streaming_structural_parser.h */
 
 } // namespace westmere
 } // namespace simdjson
 UNTARGET_REGION
-/* end file src/generic/stage2/streaming_structural_parser.h */
+/* end file src/generic/stage2/tape_writer.h */
 #endif
 
 SIMDJSON_POP_DISABLE_WARNINGS
-/* end file src/generic/stage2/streaming_structural_parser.h */
+/* end file src/generic/stage2/tape_writer.h */
diff --git a/inst/include/simdjson.h b/inst/include/simdjson.h
index 0a1d140..21efa8e 100644
--- a/inst/include/simdjson.h
+++ b/inst/include/simdjson.h
@@ -1,4 +1,4 @@
-/* auto-generated on Wed May 20 10:23:07 EDT 2020. Do not edit! */
+/* auto-generated on Fri 12 Jun 2020 13:09:36 EDT. Do not edit! */
 /* begin file include/simdjson.h */
 #ifndef SIMDJSON_H
 #define SIMDJSON_H
@@ -2030,7 +2030,6 @@ namespace simdjson {
  */
 enum error_code {
   SUCCESS = 0,              ///< No error
-  SUCCESS_AND_HAS_MORE,     ///< @private No error and buffer still has more data
   CAPACITY,                 ///< This parser can't support a document that big
   MEMALLOC,                 ///< Error allocating memory, most likely out of memory
   TAPE_ERROR,               ///< Something went wrong while writing to the tape (stage 2), this is a generic error
@@ -2409,6 +2408,187 @@ inline char *allocate_padded_buffer(size_t length) noexcept;
 #ifndef SIMDJSON_IMPLEMENTATION_H
 #define SIMDJSON_IMPLEMENTATION_H
 
+/* begin file include/simdjson/internal/dom_parser_implementation.h */
+#ifndef SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H
+#define SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H
+
+#include <memory>
+
+namespace simdjson {
+
+namespace dom {
+class document;
+} // namespace dom
+
+namespace internal {
+
+/**
+ * An implementation of simdjson's DOM parser for a particular CPU architecture.
+ *
+ * This class is expected to be accessed only by pointer, and never move in memory (though the
+ * pointer can move).
+ */
+class dom_parser_implementation {
+public:
+
+  /**
+   * @private For internal implementation use
+   *
+   * Run a full JSON parse on a single document (stage1 + stage2).
+   * 
+   * Guaranteed only to be called when capacity > document length.
+   *
+   * Overridden by each implementation.
+   *
+   * @param buf The json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
+   * @param len The length of the json document.
+   * @return The error code, or SUCCESS if there was no error.
+   */
+  WARN_UNUSED virtual error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept = 0;
+
+  /**
+   * @private For internal implementation use
+   *
+   * Stage 1 of the document parser.
+   * 
+   * Guaranteed only to be called when capacity > document length.
+   *
+   * Overridden by each implementation.
+   *
+   * @param buf The json document to parse.
+   * @param len The length of the json document.
+   * @param streaming Whether this is being called by parser::parse_many.
+   * @return The error code, or SUCCESS if there was no error.
+   */
+  WARN_UNUSED virtual error_code stage1(const uint8_t *buf, size_t len, bool streaming) noexcept = 0;
+
+  /**
+   * @private For internal implementation use
+   *
+   * Stage 2 of the document parser.
+   * 
+   * Called after stage1().
+   *
+   * Overridden by each implementation.
+   *
+   * @param doc The document to output to.
+   * @return The error code, or SUCCESS if there was no error.
+   */
+  WARN_UNUSED virtual error_code stage2(dom::document &doc) noexcept = 0;
+
+  /**
+   * @private For internal implementation use
+   *
+   * Stage 2 of the document parser for parser::parse_many.
+   *
+   * Guaranteed only to be called after stage1().
+   * Overridden by each implementation.
+   *
+   * @param doc The document to output to.
+   * @return The error code, SUCCESS if there was no error, or EMPTY if all documents have been parsed.
+   */
+  WARN_UNUSED virtual error_code stage2_next(dom::document &doc) noexcept = 0;
+
+  /**
+   * Change the capacity of this parser.
+   * 
+   * Generally used for reallocation.
+   *
+   * @param capacity The new capacity: the largest document length, in bytes,
+   *                 this parser must support without reallocating.
+   * @return The error code, or SUCCESS if there was no error.
+   */
+  virtual error_code set_capacity(size_t capacity) noexcept = 0;
+
+  /**
+   * Change the max depth of this parser.
+   *
+   * Generally used for reallocation.
+   *
+   * @param max_depth The new max_depth: the maximum level of nested objects
+   *                  and arrays this parser must support.
+   * @return The error code, or SUCCESS if there was no error.
+   */
+  virtual error_code set_max_depth(size_t max_depth) noexcept = 0;
+
+  /**
+   * Deallocate this parser.
+   */
+  virtual ~dom_parser_implementation() = default;
+
+  /** Number of structural indices passed from stage 1 to stage 2 */
+  uint32_t n_structural_indexes{0};
+  /** Structural indices passed from stage 1 to stage 2 */
+  std::unique_ptr<uint32_t[]> structural_indexes{};
+  /** Next structural index to parse */
+  uint32_t next_structural_index{0};
+
+  /**
+   * The largest document this parser can support without reallocating.
+   *
+   * @return Current capacity, in bytes.
+   */
+  really_inline size_t capacity() const noexcept;
+
+  /**
+   * The maximum level of nested objects and arrays supported by this parser.
+   *
+   * @return Maximum depth, as a number of nesting levels (not bytes).
+   */
+  really_inline size_t max_depth() const noexcept;
+
+  /**
+   * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length
+   * and `max_depth` depth.
+   *
+   * @param capacity The new capacity.
+   * @param max_depth The new max_depth. Must be passed explicitly; this declaration has no default.
+   * @return The error, if there is one.
+   */
+  WARN_UNUSED inline error_code allocate(size_t capacity, size_t max_depth) noexcept;
+
+protected:
+  /**
+   * The maximum document length this parser supports.
+   *
+   * Buffers are large enough to handle any document up to this length.
+   */
+  size_t _capacity{0};
+
+  /**
+   * The maximum depth (number of nested objects and arrays) supported by this parser.
+   *
+   * Defaults to DEFAULT_MAX_DEPTH.
+   */
+  size_t _max_depth{0};
+}; // class dom_parser_implementation
+
+really_inline size_t dom_parser_implementation::capacity() const noexcept {
+  return _capacity;
+}
+
+really_inline size_t dom_parser_implementation::max_depth() const noexcept {
+  return _max_depth;
+}
+
+WARN_UNUSED
+inline error_code dom_parser_implementation::allocate(size_t capacity, size_t max_depth) noexcept {
+  if (this->max_depth() != max_depth) {
+    error_code err = set_max_depth(max_depth);
+    if (err) { return err; }
+  }
+  if (_capacity != capacity) {
+    error_code err = set_capacity(capacity);
+    if (err) { return err; }
+  }
+  return SUCCESS;
+}
+
+} // namespace internal
+} // namespace simdjson
+
+#endif // SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H
+/* end file include/simdjson/internal/dom_parser_implementation.h */
 #include <optional>
 #include <string>
 #include <atomic>
@@ -2417,8 +2597,8 @@ inline char *allocate_padded_buffer(size_t length) noexcept;
 namespace simdjson {
 
 namespace dom {
-  class parser;
-}
+  class document;
+} // namespace dom
 
 /**
  * An implementation of simdjson for a particular CPU architecture.
@@ -2461,16 +2641,19 @@ class implementation {
   /**
    * @private For internal implementation use
    *
-   * Run a full document parse (ensure_capacity, stage1 and stage2).
-   *
-   * Overridden by each implementation.
+   * Create the DOM parser implementation for this CPU architecture and store it
+   * in dst.
    *
-   * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
-   * @param len the length of the json document.
-   * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity.
-   * @return the error code, or SUCCESS if there was no error.
+   * @param capacity The largest document that will be passed to the parser.
+   * @param max_depth The maximum JSON object/array nesting this parser is expected to handle.
+   * @param dst The place to put the resulting parser implementation.
+   * @return the error code, or SUCCESS if there was no error.
    */
-  WARN_UNUSED virtual error_code parse(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept = 0;
+  virtual error_code create_dom_parser_implementation(
+    size_t capacity,
+    size_t max_depth,
+    std::unique_ptr<internal::dom_parser_implementation> &dst
+  ) const noexcept = 0;
 
   /**
    * @private For internal implementation use
@@ -2487,50 +2670,6 @@ class implementation {
    */
   WARN_UNUSED virtual error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept = 0;
 
-  /**
-   * @private For internal implementation use
-   *
-   * Stage 1 of the document parser.
-   *
-   * Overridden by each implementation.
-   *
-   * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
-   * @param len the length of the json document.
-   * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity.
-   * @param streaming whether this is being called by parser::parse_many.
-   * @return the error code, or SUCCESS if there was no error.
-   */
-  WARN_UNUSED virtual error_code stage1(const uint8_t *buf, size_t len, dom::parser &parser, bool streaming) const noexcept = 0;
-
-  /**
-   * @private For internal implementation use
-   *
-   * Stage 2 of the document parser.
-   *
-   * Overridden by each implementation.
-   *
-   * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
-   * @param len the length of the json document.
-   * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity.
-   * @return the error code, or SUCCESS if there was no error.
-   */
-  WARN_UNUSED virtual error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept = 0;
-
-  /**
-   * @private For internal implementation use
-   *
-   * Stage 2 of the document parser for parser::parse_many.
-   *
-   * Overridden by each implementation.
-   *
-   * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
-   * @param len the length of the json document.
-   * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity.
-   * @param next_json the next structural index. Start this at 0 the first time, and it will be updated to the next value to pass each time.
-   * @return the error code, SUCCESS if there was no error, or SUCCESS_AND_HAS_MORE if there was no error and stage2 can be called again.
-   */
-  WARN_UNUSED virtual error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser, size_t &next_json) const noexcept = 0;
-
 protected:
   /** @private Construct an implementation with the given name and description. For subclasses. */
   really_inline implementation(
@@ -2648,7 +2787,7 @@ extern SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> activ
 } // namespace simdjson
 
 #endif // SIMDJSON_IMPLEMENTATION_H
-/* end file include/simdjson/implementation.h */
+/* end file include/simdjson/internal/dom_parser_implementation.h */
 /* begin file include/simdjson/dom/array.h */
 #ifndef SIMDJSON_DOM_ARRAY_H
 #define SIMDJSON_DOM_ARRAY_H
@@ -3022,22 +3161,6 @@ class document {
 
 namespace simdjson {
 
-namespace internal {
-
-// expectation: sizeof(scope_descriptor) = 64/8.
-struct scope_descriptor {
-  uint32_t tape_index; // where, on the tape, does the scope ([,{) begins
-  uint32_t count; // how many elements in the scope
-}; // struct scope_descriptor
-
-#ifdef SIMDJSON_USE_COMPUTED_GOTO
-typedef void* ret_address;
-#else
-typedef char ret_address;
-#endif
-
-} // namespace internal
-
 namespace dom {
 
 class document_stream;
@@ -3075,14 +3198,14 @@ class parser {
    *
    * @param other The parser to take. Its capacity is zeroed.
    */
-  parser(parser &&other) = default;
+  really_inline parser(parser &&other) noexcept;
   parser(const parser &) = delete; ///< @private Disallow copying
   /**
    * Take another parser's buffers and state.
    *
    * @param other The parser to take. Its capacity is zeroed.
    */
-  parser &operator=(parser &&other) = default;
+  really_inline parser &operator=(parser &&other) noexcept;
   parser &operator=(const parser &) = delete; ///< @private Disallow copying
 
   /** Deallocate the JSON parser. */
@@ -3342,7 +3465,8 @@ class parser {
   /**
    * Set max_capacity. This is the largest document this parser can automatically support.
    *
-   * The parser may reallocate internal buffers as needed up to this amount.
+   * The parser may reallocate internal buffers as needed up to this amount as documents are passed
+   * to it.
    *
    * This call will not allocate or deallocate, even if capacity is currently above max_capacity.
    *
@@ -3355,19 +3479,8 @@ class parser {
   /** @private Use simdjson_error instead */
   using InvalidJSON [[deprecated("Use simdjson_error instead")]] = simdjson_error;
 
-  /** @private Next location to write to in the tape */
-  uint32_t current_loc{0};
-
-  /** @private Number of structural indices passed from stage 1 to stage 2 */
-  uint32_t n_structural_indexes{0};
-  /** @private Structural indices passed from stage 1 to stage 2 */
-  std::unique_ptr<uint32_t[]> structural_indexes{};
-
-  /** @private Tape location of each open { or [ */
-  std::unique_ptr<internal::scope_descriptor[]> containing_scope{};
-
-  /** @private Return address of each open { or [ */
-  std::unique_ptr<internal::ret_address[]> ret_address{};
+  /** @private [for benchmarking access] The implementation to use */
+  std::unique_ptr<internal::dom_parser_implementation> implementation{};
 
   /** @private Use `if (parser.parse(...).error())` instead */
   bool valid{false};
@@ -3407,20 +3520,6 @@ class parser {
    */
   size_t _max_capacity;
 
-  /**
-   * The maximum document length this parser supports.
-   *
-   * Buffers are large enough to handle any document up to this length.
-   */
-  size_t _capacity{0};
-
-  /**
-   * The maximum depth (number of nested objects and arrays) supported by this parser.
-   *
-   * Defaults to DEFAULT_MAX_DEPTH.
-   */
-  size_t _max_depth{0};
-
   /**
    * The loaded buffer (reused each time load() is called)
    */
@@ -3500,7 +3599,7 @@ class document_stream {
     really_inline bool operator!=(const iterator &other) const noexcept;
 
   private:
-    iterator(document_stream& stream, bool finished) noexcept;
+    really_inline iterator(document_stream &s, bool finished) noexcept;
     /** The document_stream we're iterating through. */
     document_stream& stream;
     /** Whether we're finished or not. */
@@ -3523,7 +3622,23 @@ class document_stream {
 
   document_stream(document_stream &other) = delete;    // Disallow copying
 
-  really_inline document_stream(dom::parser &parser, const uint8_t *buf, size_t len, size_t batch_size, error_code error = SUCCESS) noexcept;
+  /**
+   * Construct a document_stream. Does not allocate or parse anything until the iterator is
+   * used.
+   */
+  really_inline document_stream(
+    dom::parser &parser,
+    const uint8_t *buf,
+    size_t len,
+    size_t batch_size,
+    error_code error = SUCCESS
+  ) noexcept;
+
+  /**
+   * Parse the first document in the buffer. Used by begin(), to handle allocation and
+   * initialization.
+   */
+  inline void start() noexcept;
 
   /**
    * Parse the next document found in the buffer previously given to document_stream.
@@ -3536,10 +3651,7 @@ class document_stream {
    * pre-allocating a capacity defined by the batch_size defined when creating the
    * document_stream object.
    *
-   * The function returns simdjson::SUCCESS_AND_HAS_MORE (an integer = 1) in case
-   * of success and indicates that the buffer still contains more data to be parsed,
-   * meaning this function can be called again to return the next JSON document
-   * after this one.
+   * The function returns simdjson::EMPTY if there is no more data to be parsed.
    *
    * The function returns simdjson::SUCCESS (as integer = 0) in case of success
    * and indicates that the buffer has successfully been parsed to the end.
@@ -3550,55 +3662,51 @@ class document_stream {
    * the simdjson::error_message function converts these error codes into a string).
    *
    * You can also check validity by calling parser.is_valid(). The same parser can
-   * and should be reused for the other documents in the buffer. */
-  inline error_code json_parse() noexcept;
-
-  /**
-   * Returns the location (index) of where the next document should be in the
-   * buffer.
-   * Can be used for debugging, it tells the user the position of the end of the
-   * last
-   * valid JSON document parsed
+   * and should be reused for the other documents in the buffer.
    */
-  inline size_t get_current_buffer_loc() const { return current_buffer_loc; }
+  inline void next() noexcept;
 
   /**
-   * Returns the total amount of complete documents parsed by the document_stream,
-   * in the current buffer, at the given time.
+   * Pass the next batch through stage 1 and return when finished.
+   * When threads are enabled, this may wait for the stage 1 thread to finish.
    */
-  inline size_t get_n_parsed_docs() const { return n_parsed_docs; }
-
-  /**
-   * Returns the total amount of data (in bytes) parsed by the document_stream,
-   * in the current buffer, at the given time.
-   */
-  inline size_t get_n_bytes_parsed() const { return n_bytes_parsed; }
-
-  inline const uint8_t *buf() const { return _buf + buf_start; }
+  inline void load_batch() noexcept;
 
-  inline void advance(size_t offset) { buf_start += offset; }
+  /** Get the next document index. */
+  inline size_t next_batch_start() const noexcept;
 
-  inline size_t remaining() const { return _len - buf_start; }
+  /** Pass the next batch through stage 1 with the given parser. */
+  inline error_code run_stage1(dom::parser &p, size_t batch_start) noexcept;
 
   dom::parser &parser;
-  const uint8_t *_buf;
-  const size_t _len;
-  size_t _batch_size; // this is actually variable!
-  size_t buf_start{0};
-  size_t next_json{0};
-  bool load_next_batch{true};
-  size_t current_buffer_loc{0};
-#ifdef SIMDJSON_THREADS_ENABLED
-  size_t last_json_buffer_loc{0};
-#endif
-  size_t n_parsed_docs{0};
-  size_t n_bytes_parsed{0};
-  error_code error{SUCCESS_AND_HAS_MORE};
+  const uint8_t *buf;
+  const size_t len;
+  const size_t batch_size;
+  size_t batch_start{0};
+  /** The error (or lack thereof) from the current document. */
+  error_code error;
+
 #ifdef SIMDJSON_THREADS_ENABLED
-  error_code stage1_is_ok_thread{SUCCESS};
-  std::thread stage_1_thread{};
-  dom::parser parser_thread{};
-#endif
+  inline void load_from_stage1_thread() noexcept;
+
+  /** Start a thread to run stage 1 on the next batch. */
+  inline void start_stage1_thread() noexcept;
+
+  /** Wait for the stage 1 thread to finish and capture the results. */
+  inline void finish_stage1_thread() noexcept;
+
+  /** The error returned from the stage 1 thread. */
+  error_code stage1_thread_error{UNINITIALIZED};
+  /** The thread used to run stage 1 against the next batch in the background. */
+  std::thread stage1_thread{};
+
+  /**
+   * The parser used to run stage 1 in the background. Will be swapped
+   * with the regular parser when finished.
+   */
+  dom::parser stage1_thread_parser{};
+#endif // SIMDJSON_THREADS_ENABLED
+
   friend class dom::parser;
 }; // class document_stream
 
@@ -4842,125 +4950,37 @@ inline std::ostream& operator<<(std::ostream& out, const simdjson_result<dom::ar
 #include <limits>
 #include <stdexcept>
 
-namespace simdjson {
-namespace internal {
-
-/**
- * This algorithm is used to quickly identify the buffer position of
- * the last JSON document inside the current batch.
- *
- * It does its work by finding the last pair of structural characters
- * that represent the end followed by the start of a document.
- *
- * Simply put, we iterate over the structural characters, starting from
- * the end. We consider that we found the end of a JSON document when the
- * first element of the pair is NOT one of these characters: '{' '[' ';' ','
- * and when the second element is NOT one of these characters: '}' '}' ';' ','.
- *
- * This simple comparison works most of the time, but it does not cover cases
- * where the batch's structural indexes contain a perfect amount of documents.
- * In such a case, we do not have access to the structural index which follows
- * the last document, therefore, we do not have access to the second element in
- * the pair, and means that we cannot identify the last document. To fix this
- * issue, we keep a count of the open and closed curly/square braces we found
- * while searching for the pair. When we find a pair AND the count of open and
- * closed curly/square braces is the same, we know that we just passed a
- * complete
- * document, therefore the last json buffer location is the end of the batch
- * */
-inline uint32_t find_last_json_buf_idx(const uint8_t *buf, size_t size, const dom::parser &parser) {
-  // this function can be generally useful
-  if (parser.n_structural_indexes == 0)
-    return 0;
-  auto last_i = parser.n_structural_indexes - 1;
-  if (parser.structural_indexes[last_i] == size) {
-    if (last_i == 0)
-      return 0;
-    last_i = parser.n_structural_indexes - 2;
-  }
-  auto arr_cnt = 0;
-  auto obj_cnt = 0;
-  for (auto i = last_i; i > 0; i--) {
-    auto idxb = parser.structural_indexes[i];
-    switch (buf[idxb]) {
-    case ':':
-    case ',':
-      continue;
-    case '}':
-      obj_cnt--;
-      continue;
-    case ']':
-      arr_cnt--;
-      continue;
-    case '{':
-      obj_cnt++;
-      break;
-    case '[':
-      arr_cnt++;
-      break;
-    }
-    auto idxa = parser.structural_indexes[i - 1];
-    switch (buf[idxa]) {
-    case '{':
-    case '[':
-    case ':':
-    case ',':
-      continue;
-    }
-    if (!arr_cnt && !obj_cnt) {
-      return last_i + 1;
-    }
-    return i;
-  }
-  return 0;
-}
-
-// returns true if the provided byte value is an ASCII character
-static inline bool is_ascii(char c) {
-  return ((unsigned char)c) <= 127;
-}
-
-// if the string ends with  UTF-8 values, backtrack
-// up to the first ASCII character. May return 0.
-static inline size_t trimmed_length_safe_utf8(const char * c, size_t len) {
-  while ((len > 0) and (not is_ascii(c[len - 1]))) {
-    len--;
-  }
-  return len;
-}
-
-} // namespace internal
-
-} // namespace simdjson
-
 namespace simdjson {
 namespace dom {
+
 really_inline document_stream::document_stream(
   dom::parser &_parser,
-  const uint8_t *buf,
-  size_t len,
-  size_t batch_size,
+  const uint8_t *_buf,
+  size_t _len,
+  size_t _batch_size,
   error_code _error
 ) noexcept
   : parser{_parser},
-   _buf{buf},
-   _len{len},
-   _batch_size(batch_size),
-   error(_error)
+    buf{_buf},
+    len{_len},
+    batch_size{_batch_size},
+    error{_error}
 {
-  if (!error) { error = json_parse(); }
 }
 
 inline document_stream::~document_stream() noexcept {
 #ifdef SIMDJSON_THREADS_ENABLED
-  if (stage_1_thread.joinable()) {
-    stage_1_thread.join();
+  // TODO kill the thread, why should people have to wait for a non-side-effecting operation to complete
+  if (stage1_thread.joinable()) {
+    stage1_thread.join();
   }
 #endif
 }
 
 really_inline document_stream::iterator document_stream::begin() noexcept {
-  return iterator(*this, false);
+  start();
+  // If there are no documents, we're finished.
+  return iterator(*this, error == EMPTY);
 }
 
 really_inline document_stream::iterator document_stream::end() noexcept {
@@ -4972,17 +4992,15 @@ really_inline document_stream::iterator::iterator(document_stream& _stream, bool
 }
 
 really_inline simdjson_result<element> document_stream::iterator::operator*() noexcept {
-  error_code err = stream.error == SUCCESS_AND_HAS_MORE ? SUCCESS : stream.error;
-  if (err) { return err; }
+  // Once we have yielded any errors, we're finished.
+  if (stream.error) { finished = true; return stream.error; }
   return stream.parser.doc.root();
 }
 
 really_inline document_stream::iterator& document_stream::iterator::operator++() noexcept {
-  if (stream.error == SUCCESS_AND_HAS_MORE) {
-    stream.error = stream.json_parse();
-  } else {
-    finished = true;
-  }
+  stream.next();
+  // If that was the last document, we're finished.
+  if (stream.error == EMPTY) { finished = true; }
   return *this;
 }
 
@@ -4990,130 +5008,96 @@ really_inline bool document_stream::iterator::operator!=(const document_stream::
   return finished != other.finished;
 }
 
+inline void document_stream::start() noexcept {
+  if (error) { return; }
+
+  error = parser.ensure_capacity(batch_size);
+  if (error) { return; }
+
+  // Always run the first stage 1 parse immediately
+  batch_start = 0;
+  error = run_stage1(parser, batch_start);
+  if (error) { return; }
+
 #ifdef SIMDJSON_THREADS_ENABLED
+  if (next_batch_start() < len) {
+    // Kick off the first thread if needed
+    error = stage1_thread_parser.ensure_capacity(batch_size);
+    if (error) { return; }
+    start_stage1_thread();
+    if (error) { return; }
+  }
+#endif // SIMDJSON_THREADS_ENABLED
 
-// threaded version of json_parse
-// todo: simplify this code further
-inline error_code document_stream::json_parse() noexcept {
-  error = parser.ensure_capacity(_batch_size);
-  if (error) { return error; }
-  error = parser_thread.ensure_capacity(_batch_size);
-  if (error) { return error; }
-
-  if (unlikely(load_next_batch)) {
-    // First time loading
-    if (!stage_1_thread.joinable()) {
-      _batch_size = (std::min)(_batch_size, remaining());
-      _batch_size = internal::trimmed_length_safe_utf8((const char *)buf(), _batch_size);
-      if (_batch_size == 0) {
-        return simdjson::UTF8_ERROR;
-      }
-      auto stage1_is_ok = error_code(simdjson::active_implementation->stage1(buf(), _batch_size, parser, true));
-      if (stage1_is_ok != simdjson::SUCCESS) {
-        return stage1_is_ok;
-      }
-      uint32_t last_index = internal::find_last_json_buf_idx(buf(), _batch_size, parser);
-      if (last_index == 0) {
-        if (parser.n_structural_indexes == 0) {
-          return simdjson::EMPTY;
-        }
-      } else {
-        parser.n_structural_indexes = last_index + 1;
-      }
-    }
-    // the second thread is running or done.
-    else {
-      stage_1_thread.join();
-      if (stage1_is_ok_thread != simdjson::SUCCESS) {
-        return stage1_is_ok_thread;
-      }
-      std::swap(parser.structural_indexes, parser_thread.structural_indexes);
-      parser.n_structural_indexes = parser_thread.n_structural_indexes;
-      advance(last_json_buffer_loc);
-      n_bytes_parsed += last_json_buffer_loc;
-    }
-    // let us decide whether we will start a new thread
-    if (remaining() - _batch_size > 0) {
-      last_json_buffer_loc =
-          parser.structural_indexes[internal::find_last_json_buf_idx(buf(), _batch_size, parser)];
-      _batch_size = (std::min)(_batch_size, remaining() - last_json_buffer_loc);
-      if (_batch_size > 0) {
-        _batch_size = internal::trimmed_length_safe_utf8(
-            (const char *)(buf() + last_json_buffer_loc), _batch_size);
-        if (_batch_size == 0) {
-          return simdjson::UTF8_ERROR;
-        }
-        // let us capture read-only variables
-        const uint8_t *const b = buf() + last_json_buffer_loc;
-        const size_t bs = _batch_size;
-        // we call the thread on a lambda that will update
-        // this->stage1_is_ok_thread
-        // there is only one thread that may write to this value
-        stage_1_thread = std::thread([this, b, bs] {
-          this->stage1_is_ok_thread = error_code(simdjson::active_implementation->stage1(b, bs, this->parser_thread, true));
-        });
-      }
-    }
-    next_json = 0;
-    load_next_batch = false;
-  } // load_next_batch
-  error_code res = simdjson::active_implementation->stage2(buf(), remaining(), parser, next_json);
-  if (res == simdjson::SUCCESS_AND_HAS_MORE) {
-    n_parsed_docs++;
-    current_buffer_loc = parser.structural_indexes[next_json];
-    load_next_batch = (current_buffer_loc == last_json_buffer_loc);
-  } else if (res == simdjson::SUCCESS) {
-    n_parsed_docs++;
-    if (remaining() > _batch_size) {
-      current_buffer_loc = parser.structural_indexes[next_json - 1];
-      load_next_batch = true;
-      res = simdjson::SUCCESS_AND_HAS_MORE;
-    }
+  next();
+}
+
+inline void document_stream::next() noexcept {
+  if (error) { return; }
+
+  // Load the next document from the batch
+  error = parser.implementation->stage2_next(parser.doc);
+
+  // If that was the last document in the batch, load another batch (if available)
+  while (error == EMPTY) {
+    batch_start = next_batch_start();
+    if (batch_start >= len) { break; }
+
+#ifdef SIMDJSON_THREADS_ENABLED
+    load_from_stage1_thread();
+#else
+    error = run_stage1(parser, batch_start);
+#endif
+    if (error) { continue; } // If the error was EMPTY, we may want to load another batch.
+
+    // Run stage 2 on the first document in the batch
+    error = parser.implementation->stage2_next(parser.doc);
   }
-  return res;
 }
 
-#else  // SIMDJSON_THREADS_ENABLED
+inline size_t document_stream::next_batch_start() const noexcept {
+  return batch_start + parser.implementation->structural_indexes[parser.implementation->n_structural_indexes];
+}
 
-// single-threaded version of json_parse
-inline error_code document_stream::json_parse() noexcept {
-  error = parser.ensure_capacity(_batch_size);
-  if (error) { return error; }
+inline error_code document_stream::run_stage1(dom::parser &p, size_t _batch_start) noexcept {
+  // If this is the final batch, pass partial = false
+  size_t remaining = len - _batch_start;
+  if (remaining <= batch_size) {
+    return p.implementation->stage1(&buf[_batch_start], remaining, false);
+  } else {
+    return p.implementation->stage1(&buf[_batch_start], batch_size, true);
+  }
+}
 
-  if (unlikely(load_next_batch)) {
-    advance(current_buffer_loc);
-    n_bytes_parsed += current_buffer_loc;
-    _batch_size = (std::min)(_batch_size, remaining());
-    _batch_size = internal::trimmed_length_safe_utf8((const char *)buf(), _batch_size);
-    auto stage1_is_ok = (error_code)simdjson::active_implementation->stage1(buf(), _batch_size, parser, true);
-    if (stage1_is_ok != simdjson::SUCCESS) {
-      return stage1_is_ok;
-    }
-    uint32_t last_index = internal::find_last_json_buf_idx(buf(), _batch_size, parser);
-    if (last_index == 0) {
-      if (parser.n_structural_indexes == 0) {
-        return EMPTY;
-      }
-    } else {
-      parser.n_structural_indexes = last_index + 1;
-    }
-    load_next_batch = false;
-  } // load_next_batch
-  error_code res = simdjson::active_implementation->stage2(buf(), remaining(), parser, next_json);
-  if (likely(res == simdjson::SUCCESS_AND_HAS_MORE)) {
-    n_parsed_docs++;
-    current_buffer_loc = parser.structural_indexes[next_json];
-  } else if (res == simdjson::SUCCESS) {
-    n_parsed_docs++;
-    if (remaining() > _batch_size) {
-      current_buffer_loc = parser.structural_indexes[next_json - 1];
-      next_json = 1;
-      load_next_batch = true;
-      res = simdjson::SUCCESS_AND_HAS_MORE;
-    }
+#ifdef SIMDJSON_THREADS_ENABLED
+
+inline void document_stream::load_from_stage1_thread() noexcept {
+  stage1_thread.join();
+
+  // Swap to the parser that was loaded up in the thread. Make sure the parser has
+  // enough memory to swap to, as well.
+  std::swap(parser, stage1_thread_parser);
+  error = stage1_thread_error;
+  if (error) { return; }
+
+  // If there's anything left, start the stage 1 thread!
+  if (next_batch_start() < len) {
+    start_stage1_thread();
   }
-  return res;
 }
+
+inline void document_stream::start_stage1_thread() noexcept {
+  // we call the thread on a lambda that will update
+  // this->stage1_thread_error
+  // there is only one thread that may write to this value
+  // TODO this is NOT exception-safe.
+  this->stage1_thread_error = UNINITIALIZED; // In case something goes wrong, make sure it's an error
+  size_t _next_batch_start = this->next_batch_start();
+  stage1_thread = std::thread([this, _next_batch_start] {
+    this->stage1_thread_error = run_stage1(this->stage1_thread_parser, _next_batch_start);
+  });
+}
+
 #endif // SIMDJSON_THREADS_ENABLED
 
 } // namespace dom
@@ -5152,7 +5136,7 @@ inline error_code document::allocate(size_t capacity) noexcept {
   // worse with "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6"
   //where len + 1 tape elements are
   // generated, see issue https://github.com/lemire/simdjson/issues/345
-  size_t tape_capacity = ROUNDUP_N(capacity + 2, 64);
+  size_t tape_capacity = ROUNDUP_N(capacity + 3, 64);
   // a document with only zero-length strings... could have len/3 string
   // and we would need len/3 * 5 bytes on the string buffer
   size_t string_capacity = ROUNDUP_N(5 * capacity / 3 + 32, 64);
@@ -6741,8 +6725,11 @@ namespace dom {
 //
 really_inline parser::parser(size_t max_capacity) noexcept
   : _max_capacity{max_capacity},
-    loaded_bytes(nullptr, &aligned_free_char)
-    {}
+    loaded_bytes(nullptr, &aligned_free_char) {
+}
+really_inline parser::parser(parser &&other) noexcept = default;
+really_inline parser &parser::operator=(parser &&other) noexcept = default;
+
 inline bool parser::is_valid() const noexcept { return valid; }
 inline int parser::get_error_code() const noexcept { return error; }
 inline std::string parser::get_error_message() const noexcept { return error_message(error); }
@@ -6825,15 +6812,12 @@ inline simdjson_result<element> parser::parse(const uint8_t *buf, size_t len, bo
     memcpy((void *)buf, tmp_buf, len);
   }
 
-  code = simdjson::active_implementation->parse(buf, len, *this);
+  code = implementation->parse(buf, len, doc);
   if (realloc_if_needed) {
     aligned_free((void *)buf); // must free before we exit
   }
   if (code) { return code; }
 
-  // We're indicating validity via the simdjson_result<element>, so set the parse state back to invalid
-  valid = false;
-  error = UNINITIALIZED;
   return doc.root();
 }
 really_inline simdjson_result<element> parser::parse(const char *buf, size_t len, bool realloc_if_needed) & noexcept {
@@ -6860,81 +6844,30 @@ inline document_stream parser::parse_many(const padded_string &s, size_t batch_s
 }
 
 really_inline size_t parser::capacity() const noexcept {
-  return _capacity;
+  return implementation ? implementation->capacity() : 0;
 }
 really_inline size_t parser::max_capacity() const noexcept {
   return _max_capacity;
 }
 really_inline size_t parser::max_depth() const noexcept {
-  return _max_depth;
+  return implementation ? implementation->max_depth() : DEFAULT_MAX_DEPTH;
 }
 
 WARN_UNUSED
 inline error_code parser::allocate(size_t capacity, size_t max_depth) noexcept {
   //
-  // If capacity has changed, reallocate capacity-based buffers
-  //
-  if (_capacity != capacity) {
-    // Set capacity to 0 until we finish, in case there's an error
-    _capacity = 0;
-
-    //
-    // Reallocate the document
-    //
-    error_code err = doc.allocate(capacity);
-    if (err) { return err; }
-
-    //
-    // Don't allocate 0 bytes, just return.
-    //
-    if (capacity == 0) {
-      structural_indexes.reset();
-      return SUCCESS;
-    }
-
-    //
-    // Initialize stage 1 output
-    //
-    size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7;
-    structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); // TODO realloc
-    if (!structural_indexes) {
-      return MEMALLOC;
-    }
-
-    _capacity = capacity;
-
-  //
-  // If capacity hasn't changed, but the document was taken, allocate a new document.
+  // Reallocate implementation and document if needed
   //
-  } else if (!doc.tape) {
-    error_code err = doc.allocate(capacity);
-    if (err) { return err; }
+  error_code err;
+  if (implementation) {
+    err = implementation->allocate(capacity, max_depth);
+  } else {
+    err = simdjson::active_implementation->create_dom_parser_implementation(capacity, max_depth, implementation);
   }
+  if (err) { return err; }
 
-  //
-  // If max_depth has changed, reallocate those buffers
-  //
-  if (max_depth != _max_depth) {
-    _max_depth = 0;
-
-    if (max_depth == 0) {
-      ret_address.reset();
-      containing_scope.reset();
-      return SUCCESS;
-    }
-
-    //
-    // Initialize stage 2 state
-    //
-    containing_scope.reset(new (std::nothrow) internal::scope_descriptor[max_depth]); // TODO realloc
-    ret_address.reset(new (std::nothrow) internal::ret_address[max_depth]);
-
-    if (!ret_address || !containing_scope) {
-      // Could not allocate memory
-      return MEMALLOC;
-    }
-
-    _max_depth = max_depth;
+  if (implementation->capacity() != capacity || !doc.tape) {
+    return doc.allocate(capacity);
   }
   return SUCCESS;
 }
@@ -6944,24 +6877,24 @@ inline bool parser::allocate_capacity(size_t capacity, size_t max_depth) noexcep
   return !allocate(capacity, max_depth);
 }
 
-really_inline void parser::set_max_capacity(size_t max_capacity) noexcept {
-  _max_capacity = max_capacity;
-}
-
 inline error_code parser::ensure_capacity(size_t desired_capacity) noexcept {
   // If we don't have enough capacity, (try to) automatically bump it.
   // If the document was taken, reallocate that too.
   // Both in one if statement to minimize unlikely branching.
-  if (unlikely(desired_capacity > capacity() || !doc.tape)) {
+  if (unlikely(capacity() < desired_capacity || !doc.tape)) {
     if (desired_capacity > max_capacity()) {
       return error = CAPACITY;
     }
-    return allocate(desired_capacity, _max_depth > 0 ? _max_depth : DEFAULT_MAX_DEPTH);
+    return allocate(desired_capacity, max_depth());
   }
 
   return SUCCESS;
 }
 
+really_inline void parser::set_max_capacity(size_t max_capacity) noexcept {
+  _max_capacity = max_capacity;
+}
+
 } // namespace dom
 } // namespace simdjson
 

From 5f66c8a764b311c80e374170629655401e0fe5a7 Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Mon, 15 Jun 2020 18:49:46 -0700
Subject: [PATCH 12/16] re-roxygenize(), re-build, ship

---
 R/RcppExports.R     |  4 ++++
 src/RcppExports.cpp | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/R/RcppExports.R b/R/RcppExports.R
index 4664673..75710c2 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -26,6 +26,10 @@
     .Call(`_RcppSimdJson_deserialize_json`, json, json_pointer, empty_array, empty_object, simplify_to, type_policy, int64_r_type)
 }
 
+.load_json <- function(file_path, json_pointer = "", empty_array = NULL, empty_object = NULL, simplify_to = 0L, type_policy = 0L, int64_r_type = 0L) {
+    .Call(`_RcppSimdJson_load_json`, file_path, json_pointer, empty_array, empty_object, simplify_to, type_policy, int64_r_type)
+}
+
 .exceptions_enabled <- function() {
     .Call(`_RcppSimdJson_exceptions_enabled`)
 }
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index df640d0..a149941 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -22,6 +22,23 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// load_json
+SEXP load_json(const std::string& file_path, const std::string& json_pointer, SEXP empty_array, SEXP empty_object, const int simplify_to, const int type_policy, const int int64_r_type);
+RcppExport SEXP _RcppSimdJson_load_json(SEXP file_pathSEXP, SEXP json_pointerSEXP, SEXP empty_arraySEXP, SEXP empty_objectSEXP, SEXP simplify_toSEXP, SEXP type_policySEXP, SEXP int64_r_typeSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const std::string& >::type file_path(file_pathSEXP);
+    Rcpp::traits::input_parameter< const std::string& >::type json_pointer(json_pointerSEXP);
+    Rcpp::traits::input_parameter< SEXP >::type empty_array(empty_arraySEXP);
+    Rcpp::traits::input_parameter< SEXP >::type empty_object(empty_objectSEXP);
+    Rcpp::traits::input_parameter< const int >::type simplify_to(simplify_toSEXP);
+    Rcpp::traits::input_parameter< const int >::type type_policy(type_policySEXP);
+    Rcpp::traits::input_parameter< const int >::type int64_r_type(int64_r_typeSEXP);
+    rcpp_result_gen = Rcpp::wrap(load_json(file_path, json_pointer, empty_array, empty_object, simplify_to, type_policy, int64_r_type));
+    return rcpp_result_gen;
+END_RCPP
+}
 // exceptions_enabled
 bool exceptions_enabled();
 RcppExport SEXP _RcppSimdJson_exceptions_enabled() {
@@ -85,6 +102,7 @@ END_RCPP
 
 static const R_CallMethodDef CallEntries[] = {
     {"_RcppSimdJson_deserialize_json", (DL_FUNC) &_RcppSimdJson_deserialize_json, 7},
+    {"_RcppSimdJson_load_json", (DL_FUNC) &_RcppSimdJson_load_json, 7},
     {"_RcppSimdJson_exceptions_enabled", (DL_FUNC) &_RcppSimdJson_exceptions_enabled, 0},
     {"_RcppSimdJson_check_int64", (DL_FUNC) &_RcppSimdJson_check_int64, 0},
     {"_RcppSimdJson_validateJSON", (DL_FUNC) &_RcppSimdJson_validateJSON, 1},

From c12be19ecd56d1e3660da057ef197890ffad7210 Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Mon, 15 Jun 2020 19:46:51 -0700
Subject: [PATCH 13/16] fix bad includes

---
 inst/include/RcppSimdJson/common.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/inst/include/RcppSimdJson/common.hpp b/inst/include/RcppSimdJson/common.hpp
index 1261d36..15a3a49 100644
--- a/inst/include/RcppSimdJson/common.hpp
+++ b/inst/include/RcppSimdJson/common.hpp
@@ -126,8 +126,8 @@ enum class Simplify_To : int {
 } // namespace rcppsimdjson
 
 
-#include <simdjson.h>
-#include "RcppSimdJson/utils.hpp"
+#include "../simdjson.h"
+#include "utils.hpp"
 
 
 namespace rcppsimdjson {

From c8e5026c03222de5527df0ed5745c070ae2110a0 Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Mon, 15 Jun 2020 20:24:42 -0700
Subject: [PATCH 14/16] fix line deletion missing from commit
 70adc7b24faa90036343f77c9adf2568ce682882

---
 inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp
index f3909d5..70bfacc 100644
--- a/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp
+++ b/inst/include/RcppSimdJson/deserialize/Type_Doctor.hpp
@@ -222,7 +222,7 @@ inline constexpr auto Type_Doctor<Type_Policy::ints_as_dbls>::common_R_type() co
   if (chr_ && !(dbl_ || i64_ || i32_ || lgl_ || u64_)) {
     return rcpp_T::chr;
   }
-
+  
   if (dbl_ && !(lgl_ || u64_)) { // any number will become double
     return rcpp_T::dbl;
   }

From 4bb2cec129792be1e801fb3b817cefd4e48119b3 Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Tue, 16 Jun 2020 17:31:32 -0700
Subject: [PATCH 15/16] revert to previous simdjson (Wed May 20 10:23:07 EDT
 2020)

---
 inst/include/simdjson.cpp | 5847 +++++++++++++++++--------------------
 inst/include/simdjson.h   |  829 +++---
 2 files changed, 3099 insertions(+), 3577 deletions(-)

diff --git a/inst/include/simdjson.cpp b/inst/include/simdjson.cpp
index d99dc8b..a2d815f 100644
--- a/inst/include/simdjson.cpp
+++ b/inst/include/simdjson.cpp
@@ -1,4 +1,4 @@
-/* auto-generated on Fri 12 Jun 2020 13:09:36 EDT. Do not edit! */
+/* auto-generated on Wed May 20 10:23:07 EDT 2020. Do not edit! */
 /* begin file src/simdjson.cpp */
 #include "simdjson.h"
 
@@ -12,6 +12,7 @@ namespace internal {
 
   SIMDJSON_DLLIMPORTEXPORT const error_code_info error_codes[] {
     { SUCCESS, "No error" },
+    { SUCCESS_AND_HAS_MORE, "No error and buffer still has more data" },
     { CAPACITY, "This parser can't support a document that big" },
     { MEMALLOC, "Error allocating memory, we're most likely out of memory" },
     { TAPE_ERROR, "The JSON document has an improper structure: missing or superfluous commas, braces, missing keys, etc." },
@@ -358,6 +359,8 @@ static const uint64_t thintable_epi8[256] = {
 namespace simdjson {
 namespace haswell {
 
+using namespace simdjson::dom;
+
 class implementation final : public simdjson::implementation {
 public:
   really_inline implementation() : simdjson::implementation(
@@ -365,12 +368,11 @@ class implementation final : public simdjson::implementation {
       "Intel/AMD AVX2",
       instruction_set::AVX2 | instruction_set::PCLMULQDQ | instruction_set::BMI1 | instruction_set::BMI2
   ) {}
-  WARN_UNUSED error_code create_dom_parser_implementation(
-    size_t capacity,
-    size_t max_length,
-    std::unique_ptr<internal::dom_parser_implementation>& dst
-  ) const noexcept final;
+  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
   WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
+  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final;
+  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
+  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final;
 };
 
 } // namespace haswell
@@ -396,12 +398,11 @@ using namespace simdjson::dom;
 class implementation final : public simdjson::implementation {
 public:
   really_inline implementation() : simdjson::implementation("westmere", "Intel/AMD SSE4.2", instruction_set::SSE42 | instruction_set::PCLMULQDQ) {}
-  WARN_UNUSED error_code create_dom_parser_implementation(
-    size_t capacity,
-    size_t max_length,
-    std::unique_ptr<internal::dom_parser_implementation>& dst
-  ) const noexcept final;
+  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
   WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
+  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final;
+  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
+  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final;
 };
 
 } // namespace westmere
@@ -427,12 +428,11 @@ using namespace simdjson::dom;
 class implementation final : public simdjson::implementation {
 public:
   really_inline implementation() : simdjson::implementation("arm64", "ARM NEON", instruction_set::NEON) {}
-  WARN_UNUSED error_code create_dom_parser_implementation(
-    size_t capacity,
-    size_t max_length,
-    std::unique_ptr<internal::dom_parser_implementation>& dst
-  ) const noexcept final;
+  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
   WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
+  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final;
+  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
+  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final;
 };
 
 } // namespace arm64
@@ -462,12 +462,11 @@ class implementation final : public simdjson::implementation {
       "Generic fallback implementation",
       0
   ) {}
-  WARN_UNUSED error_code create_dom_parser_implementation(
-    size_t capacity,
-    size_t max_length,
-    std::unique_ptr<internal::dom_parser_implementation>& dst
-  ) const noexcept final;
+  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
   WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
+  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept final;
+  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser) const noexcept final;
+  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, parser &parser, size_t &next_json) const noexcept final;
 };
 
 } // namespace fallback
@@ -490,16 +489,21 @@ class detect_best_supported_implementation_on_first_use final : public implement
   const std::string &name() const noexcept final { return set_best()->name(); }
   const std::string &description() const noexcept final { return set_best()->description(); }
   uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); }
-  WARN_UNUSED error_code create_dom_parser_implementation(
-    size_t capacity,
-    size_t max_length,
-    std::unique_ptr<internal::dom_parser_implementation>& dst
-  ) const noexcept final {
-    return set_best()->create_dom_parser_implementation(capacity, max_length, dst);
+  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept final {
+    return set_best()->parse(buf, len, parser);
   }
   WARN_UNUSED error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final {
     return set_best()->minify(buf, len, dst, dst_len);
   }
+  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, dom::parser &parser, bool streaming) const noexcept final {
+    return set_best()->stage1(buf, len, parser, streaming);
+  }
+  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept final {
+    return set_best()->stage2(buf, len, parser);
+  }
+  WARN_UNUSED error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser, size_t &next_json) const noexcept final {
+    return set_best()->stage2(buf, len, parser, next_json);
+  }
 
   really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
 private:
@@ -528,16 +532,21 @@ const std::initializer_list<const implementation *> available_implementation_poi
 // So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support
 class unsupported_implementation final : public implementation {
 public:
-  WARN_UNUSED error_code create_dom_parser_implementation(
-    size_t,
-    size_t,
-    std::unique_ptr<internal::dom_parser_implementation>&
-  ) const noexcept final {
+  WARN_UNUSED error_code parse(const uint8_t *, size_t, dom::parser &) const noexcept final {
     return UNSUPPORTED_ARCHITECTURE;
   }
   WARN_UNUSED error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final {
     return UNSUPPORTED_ARCHITECTURE;
   }
+  WARN_UNUSED error_code stage1(const uint8_t *, size_t, dom::parser &, bool) const noexcept final {
+    return UNSUPPORTED_ARCHITECTURE;
+  }
+  WARN_UNUSED error_code stage2(const uint8_t *, size_t, dom::parser &) const noexcept final {
+    return UNSUPPORTED_ARCHITECTURE;
+  }
+  WARN_UNUSED error_code stage2(const uint8_t *, size_t, dom::parser &, size_t &) const noexcept final {
+    return UNSUPPORTED_ARCHITECTURE;
+  }
 
   unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
 };
@@ -1933,151 +1942,7 @@ const uint64_t mantissa_128[] = {
 /* simdprune_tables.h already included: #include "simdprune_tables.h" */
 
 #if SIMDJSON_IMPLEMENTATION_ARM64
-/* begin file src/arm64/implementation.cpp */
-/* arm64/implementation.h already included: #include "arm64/implementation.h" */
-/* begin file src/arm64/dom_parser_implementation.h */
-#ifndef SIMDJSON_ARM64_DOM_PARSER_IMPLEMENTATION_H
-#define SIMDJSON_ARM64_DOM_PARSER_IMPLEMENTATION_H
-
-/* isadetection.h already included: #include "isadetection.h" */
-
-namespace simdjson {
-namespace arm64 {
-
-/* begin file src/generic/dom_parser_implementation.h */
-// expectation: sizeof(scope_descriptor) = 64/8.
-struct scope_descriptor {
-  uint32_t tape_index; // where, on the tape, does the scope ([,{) begins
-  uint32_t count; // how many elements in the scope
-}; // struct scope_descriptor
-
-#ifdef SIMDJSON_USE_COMPUTED_GOTO
-typedef void* ret_address_t;
-#else
-typedef char ret_address_t;
-#endif
-
-class dom_parser_implementation final : public internal::dom_parser_implementation {
-public:
-  /** Tape location of each open { or [ */
-  std::unique_ptr<scope_descriptor[]> containing_scope{};
-  /** Return address of each open { or [ */
-  std::unique_ptr<ret_address_t[]> ret_address{};
-  /** Buffer passed to stage 1 */
-  const uint8_t *buf{};
-  /** Length passed to stage 1 */
-  size_t len{0};
-  /** Document passed to stage 2 */
-  dom::document *doc{};
-  /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */
-  error_code error{UNINITIALIZED};
-
-  really_inline dom_parser_implementation();
-  dom_parser_implementation(const dom_parser_implementation &) = delete;
-  dom_parser_implementation & operator=(const dom_parser_implementation &) = delete;
-
-  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
-  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final;
-  WARN_UNUSED error_code check_for_unclosed_array() noexcept;
-  WARN_UNUSED error_code stage2(dom::document &doc) noexcept final;
-  WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final;
-  WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final;
-  WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final;
-};
-
-/* begin file src/generic/stage1/allocate.h */
-namespace stage1 {
-namespace allocate {
-
-//
-// Allocates stage 1 internal state and outputs in the parser
-//
-really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) {
-  size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7;
-  parser.structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] );
-  if (!parser.structural_indexes) { return MEMALLOC; }
-  parser.structural_indexes[0] = 0;
-  parser.n_structural_indexes = 0;
-  return SUCCESS;
-}
-
-} // namespace allocate
-} // namespace stage1
-/* end file src/generic/stage1/allocate.h */
-/* begin file src/generic/stage2/allocate.h */
-namespace stage2 {
-namespace allocate {
-
-//
-// Allocates stage 2 internal state and outputs in the parser
-//
-really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) {
-  parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]);
-  parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]);
-
-  if (!parser.ret_address || !parser.containing_scope) {
-    return MEMALLOC;
-  }
-  return SUCCESS;
-}
-
-} // namespace allocate
-} // namespace stage2
-/* end file src/generic/stage2/allocate.h */
-
-really_inline dom_parser_implementation::dom_parser_implementation() {}
-
-// Leaving these here so they can be inlined if so desired
-WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
-  error_code err = stage1::allocate::set_capacity(*this, capacity);
-  if (err) { _capacity = 0; return err; }
-  _capacity = capacity;
-  return SUCCESS;
-}
-
-WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept {
-  error_code err = stage2::allocate::set_max_depth(*this, max_depth);
-  if (err) { _max_depth = 0; return err; }
-  _max_depth = max_depth;
-  return SUCCESS;
-}
-/* end file src/generic/stage2/allocate.h */
-
-} // namespace arm64
-} // namespace simdjson
-
-#endif // SIMDJSON_ARM64_DOM_PARSER_IMPLEMENTATION_H
-/* end file src/generic/stage2/allocate.h */
-
-TARGET_HASWELL
-
-namespace simdjson {
-namespace arm64 {
-
-WARN_UNUSED error_code implementation::create_dom_parser_implementation(
-  size_t capacity,
-  size_t max_depth,
-  std::unique_ptr<internal::dom_parser_implementation>& dst
-) const noexcept {
-  dst.reset( new (std::nothrow) dom_parser_implementation() );
-  if (!dst) { return MEMALLOC; }
-  dst->set_capacity(capacity);
-  dst->set_max_depth(max_depth);
-  return SUCCESS;
-}
-
-} // namespace arm64
-} // namespace simdjson
-
-UNTARGET_REGION
-/* end file src/generic/stage2/allocate.h */
-/* begin file src/arm64/dom_parser_implementation.cpp */
-/* arm64/implementation.h already included: #include "arm64/implementation.h" */
-/* arm64/dom_parser_implementation.h already included: #include "arm64/dom_parser_implementation.h" */
-
-//
-// Stage 1
-//
+/* begin file src/arm64/stage1.cpp */
 /* begin file src/arm64/bitmask.h */
 #ifndef SIMDJSON_ARM64_BITMASK_H
 #define SIMDJSON_ARM64_BITMASK_H
@@ -2729,6 +2594,7 @@ really_inline int8x16_t make_int8x16_t(int8_t x1,  int8_t x2,  int8_t x3,  int8_
 #endif // SIMDJSON_ARM64_SIMD_H
 /* end file src/arm64/bitmanipulation.h */
 /* arm64/bitmanipulation.h already included: #include "arm64/bitmanipulation.h" */
+/* arm64/implementation.h already included: #include "arm64/implementation.h" */
 
 namespace simdjson {
 namespace arm64 {
@@ -2799,21 +2665,24 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-  really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
-  really_inline size_t block_index();
-  really_inline bool has_full_block() const;
-  really_inline const uint8_t *full_block() const;
-  /**
-   * Get the last block, padded with spaces.
-   *
-   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
-   * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
-   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
-   *
-   * @return the number of effective characters in the last block.
-   */
-  really_inline size_t get_remainder(uint8_t *dst) const;
-  really_inline void advance();
+  really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
+  really_inline size_t block_index() { return idx; }
+  really_inline bool has_full_block() const {
+    return idx < lenminusstep;
+  }
+  really_inline const uint8_t *full_block() const {
+    return &buf[idx];
+  }
+  really_inline bool has_remainder() const {
+    return idx < len;
+  }
+  really_inline void get_remainder(uint8_t *tmp_buf) const {
+    memset(tmp_buf, 0x20, STEP_SIZE);
+    memcpy(tmp_buf, buf + idx, len - idx);
+  }
+  really_inline void advance() {
+    idx += STEP_SIZE;
+  }
 private:
   const uint8_t *buf;
   const size_t len;
@@ -2821,18 +2690,6 @@ struct buf_block_reader {
   size_t idx;
 };
 
-constexpr const int TITLE_SIZE = 12;
-
-// Routines to print masks and text for debugging bitmask operations
-UNUSED static char * format_input_text_64(const uint8_t *text) {
-  static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
-}
-
 // Routines to print masks and text for debugging bitmask operations
 UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
   static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
@@ -2852,34 +2709,6 @@ UNUSED static char * format_mask(uint64_t mask) {
   buf[64] = '\0';
   return buf;
 }
-
-template<size_t STEP_SIZE>
-really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
-
-template<size_t STEP_SIZE>
-really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
-
-template<size_t STEP_SIZE>
-really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
-  return idx < lenminusstep;
-}
-
-template<size_t STEP_SIZE>
-really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
-  return &buf[idx];
-}
-
-template<size_t STEP_SIZE>
-really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
-  memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
-  memcpy(dst, buf + idx, len - idx);
-  return len - idx;
-}
-
-template<size_t STEP_SIZE>
-really_inline void buf_block_reader<STEP_SIZE>::advance() {
-  idx += STEP_SIZE;
-}
 /* end file src/generic/stage1/buf_block_reader.h */
 /* begin file src/generic/stage1/json_string_scanner.h */
 namespace stage1 {
@@ -3179,15 +3008,13 @@ template<size_t STEP_SIZE>
 error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
   buf_block_reader<STEP_SIZE> reader(buf, len);
   json_minifier minifier(dst);
-
-  // Index the first n-1 blocks
   while (reader.has_full_block()) {
     minifier.step<STEP_SIZE>(reader.full_block(), reader);
   }
 
-  // Index the last (remainder) block, padded with spaces
-  uint8_t block[STEP_SIZE];
-  if (likely(reader.get_remainder(block)) > 0) {
+  if (likely(reader.has_remainder())) {
+    uint8_t block[STEP_SIZE];
+    reader.get_remainder(block);
     minifier.step<STEP_SIZE>(block, reader);
   }
 
@@ -3200,94 +3027,6 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
   return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
 }
 
-/* begin file src/generic/stage1/find_next_document_index.h */
-/**
-  * This algorithm is used to quickly identify the last structural position that
-  * makes up a complete document.
-  *
-  * It does this by going backwards and finding the last *document boundary* (a
-  * place where one value follows another without a comma between them). If the
-  * last document (the characters after the boundary) has an equal number of
-  * start and end brackets, it is considered complete.
-  *
-  * Simply put, we iterate over the structural characters, starting from
-  * the end. We consider that we found the end of a JSON document when the
-  * first element of the pair is NOT one of these characters: '{' '[' ';' ','
-  * and when the second element is NOT one of these characters: '}' '}' ';' ','.
-  *
-  * This simple comparison works most of the time, but it does not cover cases
-  * where the batch's structural indexes contain a perfect amount of documents.
-  * In such a case, we do not have access to the structural index which follows
-  * the last document, therefore, we do not have access to the second element in
-  * the pair, and that means we cannot identify the last document. To fix this
-  * issue, we keep a count of the open and closed curly/square braces we found
-  * while searching for the pair. When we find a pair AND the count of open and
-  * closed curly/square braces is the same, we know that we just passed a
-  * complete document, therefore the last json buffer location is the end of the
-  * batch.
-  */
-really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) {
-  // TODO don't count separately, just figure out depth
-  auto arr_cnt = 0;
-  auto obj_cnt = 0;
-  for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
-    auto idxb = parser.structural_indexes[i];
-    switch (parser.buf[idxb]) {
-    case ':':
-    case ',':
-      continue;
-    case '}':
-      obj_cnt--;
-      continue;
-    case ']':
-      arr_cnt--;
-      continue;
-    case '{':
-      obj_cnt++;
-      break;
-    case '[':
-      arr_cnt++;
-      break;
-    }
-    auto idxa = parser.structural_indexes[i - 1];
-    switch (parser.buf[idxa]) {
-    case '{':
-    case '[':
-    case ':':
-    case ',':
-      continue;
-    }
-    // Last document is complete, so the next document will appear after!
-    if (!arr_cnt && !obj_cnt) {
-      return parser.n_structural_indexes;
-    }
-    // Last document is incomplete; mark the document at i + 1 as the next one
-    return i;
-  }
-  return 0;
-}
-
-// Skip the last character if it is partial
-really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
-  if (unlikely(len < 3)) {
-    switch (len) {
-      case 2:
-        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
-        if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
-        return len;
-      case 1:
-        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
-        return len;
-      case 0:
-        return len;
-    }
-  }
-  if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
-  if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left
-  if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
-  return len;
-}
-/* end file src/generic/stage1/find_next_document_index.h */
 /* begin file src/generic/stage1/utf8_lookup2_algorithm.h */
 //
 // Detect Unicode errors.
@@ -3338,9 +3077,9 @@ really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
 //   support values with more than 23 bits (which a 4-byte character supports).
 //
 //   e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
-//
+//   
 // Legal utf-8 byte sequences per  http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
-//
+// 
 //   Code Points        1st       2s       3s       4s
 //  U+0000..U+007F     00..7F
 //  U+0080..U+07FF     C2..DF   80..BF
@@ -3355,7 +3094,6 @@ really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
 using namespace simd;
 
 namespace utf8_validation {
-  // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)".
 
   //
   // Find special case UTF-8 errors where the character is technically readable (has the right length)
@@ -3400,7 +3138,7 @@ namespace utf8_validation {
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
       // [0___]____ (ASCII)
-      0, 0, 0, 0,
+      0, 0, 0, 0,                          
       0, 0, 0, 0,
       // [10__]____ (continuation)
       0, 0, 0, 0,
@@ -3431,6 +3169,214 @@ namespace utf8_validation {
     return byte_1_high & byte_1_low & byte_2_high;
   }
 
+  //
+  // Validate the length of multibyte characters (that each multibyte character has the right number
+  // of continuation characters, and that all continuation characters are part of a multibyte
+  // character).
+  //
+  // Algorithm
+  // =========
+  //
+  // This algorithm compares *expected* continuation characters with *actual* continuation bytes,
+  // and emits an error anytime there is a mismatch.
+  //
+  // For example, in the string "𝄞₿֏ab", which has 4-, 3-, 2- and 1-byte
+  // characters, the file will look like this:
+  //
+  // | Character             | 𝄞  |    |    |    | ₿  |    |    | ֏  |    | a  | b  |
+  // |-----------------------|----|----|----|----|----|----|----|----|----|----|----|
+  // | Character Length      |  4 |    |    |    |  3 |    |    |  2 |    |  1 |  1 |
+  // | Byte                  | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 |
+  // | is_second_byte        |    |  X |    |    |    |  X |    |    |  X |    |    |
+  // | is_third_byte         |    |    |  X |    |    |    |  X |    |    |    |    |
+  // | is_fourth_byte        |    |    |    |  X |    |    |    |    |    |    |    |
+  // | expected_continuation |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
+  // | is_continuation       |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
+  //
+  // The check is basically (Second Byte OR Third Byte OR Fourth Byte) == Continuation; a mismatch is one of:
+  //
+  // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not
+  //   part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just
+  //   floating around extra outside of any character, or that there is an illegal 5-byte character,
+  //   or maybe it's at the beginning of the file before any characters have started; but it's an
+  //   error in all these cases.
+  // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means
+  //   we started a new character before we were finished with the current one.
+  //
+  // Getting the Previous Bytes
+  // --------------------------
+  //
+  // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte
+  // character, we need to "shift the bytes" to find that out. This is what they mean:
+  //
+  // - `is_continuation`: if the current byte is a continuation.
+  // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character.
+  // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character.
+  // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character.
+  //
+  // We use shuffles to go n bytes back, selecting part of the current `input` and part of the
+  // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller
+  // function, because the 1-byte-back data is used by other checks as well.
+  //
+  // Getting the Continuation Mask
+  // -----------------------------
+  //
+  // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as
+  // numbers, using signed `<` and `>` operations to check if they are continuations or leads.
+  // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because
+  // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones).
+  //
+  // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads,"
+  // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them.
+  // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0.
+  //
+  // When treated as signed numbers, they look like this:
+  //
+  // | Type         | High Bits  | Binary Range | Signed |
+  // |--------------|------------|--------------|--------|
+  // | ASCII        | `0`        | `01111111`   |   127  |
+  // |              |            | `00000000`   |     0  |
+  // | 4+-Byte Lead | `1111`     | `11111111`   |    -1  |
+  // |              |            | `11110000`   |   -16  |
+  // | 3-Byte Lead  | `1110`     | `11101111`   |   -17  |
+  // |              |            | `11100000`   |   -32  |
+  // | 2-Byte Lead  | `110`      | `11011111`   |   -33  |
+  // |              |            | `11000000`   |   -64  |
+  // | Continuation | `10`       | `10111111`   |   -65  |
+  // |              |            | `10000000`   |  -128  |
+  //
+  // This makes it pretty easy to get the continuation mask! It's just a single comparison:
+  //
+  // ```
+  // is_continuation = input < -64
+  // ```
+  //
+  // We can do something similar for the others, but it takes two comparisons instead of one: "is
+  // the start of a 2-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and
+  // `> -65`. Surely we can do better, they're right next to each other!
+  //
+  // Getting the is_xxx Masks: Shifting the Range
+  // --------------------------------------------
+  //
+  // Notice *why* continuations were a single comparison. The actual *range* would require two
+  // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get
+  // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be
+  // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`.
+  //
+  // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps
+  // ASCII down into the negative, and puts 4+-Byte Lead at the top:
+  //
+  // | Type                 | High Bits  | Binary Range | Signed |
+  // |----------------------|------------|--------------|-------|
+  // | 4+-Byte Lead (+ 128) | `0111`     | `01111111`   |   127 |
+  // |                      |            | `01110000`   |   112 |
+  // |----------------------|------------|--------------|-------|
+  // | 3-Byte Lead (+ 128)  | `0110`     | `01101111`   |   111 |
+  // |                      |            | `01100000`   |    96 |
+  // |----------------------|------------|--------------|-------|
+  // | 2-Byte Lead (+ 128)  | `010`      | `01011111`   |    95 |
+  // |                      |            | `01000000`   |    64 |
+  // |----------------------|------------|--------------|-------|
+  // | Continuation (+ 128) | `00`       | `00111111`   |    63 |
+  // |                      |            | `00000000`   |     0 |
+  // |----------------------|------------|--------------|-------|
+  // | ASCII (+ 128)        | `1`        | `11111111`   |    -1 |
+  // |                      |            | `10000000`   |  -128 |
+  // |----------------------|------------|--------------|-------|
+  // 
+  // *Now* we can use signed `>` on all of them:
+  //
+  // ```
+  // prev1 = input.prev<1>
+  // prev2 = input.prev<2>
+  // prev3 = input.prev<3>
+  // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128`
+  // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128`
+  // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128`
+  // is_second_byte = prev1_flipped > 63;  // 2+-byte lead
+  // is_third_byte  = prev2_flipped > 95;  // 3+-byte lead
+  // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead
+  // ```
+  //
+  // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number
+  // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3
+  // `^`'s at a time on Haswell, but only 2 `+`'s).
+  //
+  // That doesn't look like it saved us any instructions, did it? Well, because we're adding the
+  // same number to all of them, we can save one of those `+ 128` operations by assembling
+  // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128
+  // to it. One more instruction saved!
+  //
+  // ```
+  // prev1 = input.prev<1>
+  // prev3 = input.prev<3>
+  // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128`
+  // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128`
+  // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped); // shuffle: take the first 2 bytes from prev1 and the rest from prev3
+  // ```
+  //
+  // ### Bringing It All Together: Detecting the Errors
+  //
+  // At this point, we have `is_continuation`, `is_second_byte`, `is_third_byte` and `is_fourth_byte`.
+  // All we have left to do is check if they match!
+  //
+  // ```
+  // return (is_second_byte | is_third_byte | is_fourth_byte) ^ is_continuation;
+  // ```
+  //
+  // But wait--there's more. The above statement is only 3 operations, but they *cannot be done in
+  // parallel*. You have to do 2 `|`'s and then 1 `^`. Haswell, at least, has 3 ports that can do
+  // bitwise operations, and we're only using 1!
+  //
+  // Epilogue: Addition For Booleans
+  // -------------------------------
+  //
+  // There is one big case the above code doesn't explicitly talk about--what if is_second_byte
+  // and is_third_byte are BOTH true? That means there is a 3-byte and 2-byte character right next
+  // to each other (or any combination), and the continuation could be part of either of them!
+  // Our algorithm using `^` and `|` won't detect that the continuation byte is problematic.
+  //
+  // Never fear, though. If that situation occurs, we'll already have detected that the second
+  // leading byte was an error, because it was supposed to be a part of the preceding multibyte
+  // character, but it *wasn't a continuation*.
+  //
+  // We could stop here, but it turns out that we can fix it using `+` and `-` instead of `|` and
+  // `^`, which is both interesting and possibly useful (even though we're not using it here). It
+  // exploits the fact that in SIMD, a *true* value is -1, and a *false* value is 0. So those
+  // comparisons were giving us numbers!
+  //
+  // Given that, if you do `is_second_byte + is_third_byte + is_fourth_byte`, under normal
+  // circumstances you will either get 0 (0 + 0 + 0) or -1 (-1 + 0 + 0, etc.). Thus,
+  // `(is_second_byte + is_third_byte + is_fourth_byte) - is_continuation` will yield 0 only if
+  // *both* or *neither* are 0 (0-0 or -1 - -1). You'll get 1 or -1 if they are different. Because
+  // *any* nonzero value is treated as an error (not just -1), we're just fine here :)
+  //
+  // Further, if *more than one* multibyte character overlaps,
+  // `is_second_byte + is_third_byte + is_fourth_byte` will be -2 or -3! Subtracting `is_continuation`
+  // from *that* is guaranteed to give you a nonzero value (-1, -2 or -3). So it'll always be
+  // considered an error.
+  //
+  // One reason you might want to do this is parallelism. ^ and | cannot be regrouped across each
+  // other, so (A | B | C) ^ D will always be three operations in a row: either you do A | B -> | C -> ^ D, or
+  // you do B | C -> | A -> ^ D. But sums and differences *can* be regrouped: (A + B + C) - D can
+  // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and
+  // then add the results together. Same number of operations, but if the processor can run
+  // independent things in parallel (which most can), it runs faster.
+  //
+  // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have
+  // a super nice advantage in that more of them can be run at the same time (they can run on 3
+  // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C,
+  // saving us the cycle we would have earned by using +. Even more, using an instruction with a
+  // wider array of ports can help *other* code run ahead, too, since these instructions can "get
+  // out of the way," running on a port other instructions can't.
+  // 
+  // Epilogue II: One More Trick
+  // ---------------------------
+  //
+  // There's one more relevant trick up our sleeve: it turns out that on Intel we can "pay
+  // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in
+  // check_special_cases()--but we'll talk about that there :)
+  //
   really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, simd8<uint8_t> prev1) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
@@ -3568,22 +3514,16 @@ class bit_indexer {
 
 class json_structural_indexer {
 public:
-  /**
-   * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
-   *
-   * @param partial Setting the partial parameter to true allows the find_structural_bits to
-   *   tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
-   *   you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
-   */
   template<size_t STEP_SIZE>
-  static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept;
+  static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept;
 
 private:
-  really_inline json_structural_indexer(uint32_t *structural_indexes);
+  really_inline json_structural_indexer(uint32_t *structural_indexes)
+  : indexer{structural_indexes} {}
   template<size_t STEP_SIZE>
   really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
   really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
-  really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
+  really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming);
 
   json_scanner scanner{};
   utf8_checker checker{};
@@ -3592,44 +3532,42 @@ class json_structural_indexer {
   uint64_t unescaped_chars_error = 0;
 };
 
-really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
+really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) { // Per-block bookkeeping: validate input, flush the previous block's structurals, stash this block's
+  uint64_t unescaped = in.lteq(0x1F); // result of comparing the input against 0x1F, i.e. bytes in the ASCII control range
+  checker.check_next_input(in); // feed this input to the UTF-8 checker; its verdict is read later via checker.errors()
+  indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
+  prev_structurals = block.structural_start(); // held back one iteration; written by the next call or by finish()
+  unescaped_chars_error |= block.non_quote_inside_string(unescaped); // sticky bits; finish() returns UNESCAPED_CHARS if any are set
+}
 
-//
-// PERF NOTES:
-// We pipe 2 inputs through these stages:
-// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
-//    2 inputs' worth at once so that by the time step 2 is looking for them input, it's available.
-// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
-//    The output of step 1 depends entirely on this information. These functions don't quite use
-//    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
-//    at a time. The second input's scans has some dependency on the first ones finishing it, but
-//    they can make a lot of progress before they need that information.
-// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
-//    to finish: utf-8 checks and generating the output from the last iteration.
-//
-// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
-// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
-// workout.
-//
-template<size_t STEP_SIZE>
-error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept {
-  if (unlikely(len > parser.capacity())) { return CAPACITY; }
-  if (partial) { len = trim_partial_utf8(buf, len); }
+really_inline error_code json_structural_indexer::finish(parser &parser, size_t idx, size_t len, bool streaming) {
+  // Write out the final iteration's structurals
+  indexer.write(uint32_t(idx-64), prev_structurals);
 
-  buf_block_reader<STEP_SIZE> reader(buf, len);
-  json_structural_indexer indexer(parser.structural_indexes.get());
+  error_code error = scanner.finish(streaming);
+  if (unlikely(error != SUCCESS)) { return error; }
 
-  // Read all but the last block
-  while (reader.has_full_block()) {
-    indexer.step<STEP_SIZE>(reader.full_block(), reader);
+  if (unescaped_chars_error) {
+    return UNESCAPED_CHARS;
   }
 
-  // Take care of the last block (will always be there unless file is empty)
-  uint8_t block[STEP_SIZE];
-  if (unlikely(reader.get_remainder(block) == 0)) { return EMPTY; }
-  indexer.step<STEP_SIZE>(block, reader);
-
-  return indexer.finish(parser, reader.block_index(), len, partial);
+  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
+  /* a valid JSON file cannot have zero structural indexes - we should have
+   * found something */
+  if (unlikely(parser.n_structural_indexes == 0u)) {
+    return EMPTY;
+  }
+  if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
+    return UNEXPECTED_ERROR;
+  }
+  if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) {
+    /* the string might not be NULL terminated, but we add a virtual NULL
+     * ending character. */
+    parser.structural_indexes[parser.n_structural_indexes++] = uint32_t(len);
+  }
+  /* make it safe to dereference one beyond this array */
+  parser.structural_indexes[parser.n_structural_indexes] = 0;
+  return checker.errors();
 }
 
 template<>
@@ -3651,76 +3589,61 @@ really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_b
   reader.advance();
 }
 
-really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
-  uint64_t unescaped = in.lteq(0x1F);
-  checker.check_next_input(in);
-  indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
-  prev_structurals = block.structural_start();
-  unescaped_chars_error |= block.non_quote_inside_string(unescaped);
-}
-
-really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) {
-  // Write out the final iteration's structurals
-  indexer.write(uint32_t(idx-64), prev_structurals);
-
-  error_code error = scanner.finish(partial);
-  if (unlikely(error != SUCCESS)) { return error; }
+//
+// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
+//
+// PERF NOTES:
+// We pipe 2 inputs through these stages:
+// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
+//    2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
+// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
+//    The output of step 1 depends entirely on this information. These functions don't quite use
+//    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
+//    at a time. The second input's scans have some dependency on the first one's finishing, but
+//    they can make a lot of progress before they need that information.
+// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
+//    to finish: utf-8 checks and generating the output from the last iteration.
+//
+// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
+// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
+// workout.
+//
+// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings.
+// The caller should still ensure that the input is valid UTF-8. If you are processing substrings,
+// you may want to call on a function like trimmed_length_safe_utf8.
+template<size_t STEP_SIZE>
+error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept {
+  if (unlikely(len > parser.capacity())) { return CAPACITY; }
 
-  if (unescaped_chars_error) {
-    return UNESCAPED_CHARS;
+  buf_block_reader<STEP_SIZE> reader(buf, len);
+  json_structural_indexer indexer(parser.structural_indexes.get());
+  while (reader.has_full_block()) {
+    indexer.step<STEP_SIZE>(reader.full_block(), reader);
   }
 
-  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
-  /***
-   * This is related to https://github.com/simdjson/simdjson/issues/906
-   * Basically, we want to make sure that if the parsing continues beyond the last (valid)
-   * structural character, it quickly stops.
-   * Only three structural characters can be repeated without triggering an error in JSON:  [,] and }.
-   * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
-   * continues, then it must be [,] or }.
-   * Suppose it is ] or }. We backtrack to the first character, what could it be that would
-   * not trigger an error? It could be ] or } but no, because you can't start a document that way.
-   * It can't be a comma, a colon or any simple value. So the only way we could continue is
-   * if the repeated character is [. But if so, the document must start with [. But if the document
-   * starts with [, it should end with ]. If we enforce that rule, then we would get
-   * ][[ which is invalid.
-   **/
-  parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
-  parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
-  parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
-  parser.next_structural_index = 0;
-  // a valid JSON file cannot have zero structural indexes - we should have found something
-  if (unlikely(parser.n_structural_indexes == 0u)) {
-    return EMPTY;
+  if (likely(reader.has_remainder())) {
+    uint8_t block[STEP_SIZE];
+    reader.get_remainder(block);
+    indexer.step<STEP_SIZE>(block, reader);
   }
-  if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
-    return UNEXPECTED_ERROR;
-  }
-  if (partial) {
-    auto new_structural_indexes = find_next_document_index(parser);
-    if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
-      return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
-    }
-    parser.n_structural_indexes = new_structural_indexes;
-  }
-  return checker.errors();
+
+  return indexer.finish(parser, reader.block_index(), len, streaming);
 }
 
 } // namespace stage1
 /* end file src/generic/stage1/json_structural_indexer.h */
-WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
-  this->buf = _buf;
-  this->len = _len;
-  return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
+WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept {
+  return arm64::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming); // arm64 stage 1: delegate to the generic structural indexer with STEP_SIZE = 64
 }
 
 } // namespace arm64
 } // namespace simdjson
+/* end file src/generic/stage1/json_structural_indexer.h */
+/* begin file src/arm64/stage2.cpp */
+#ifndef SIMDJSON_ARM64_STAGE2_H
+#define SIMDJSON_ARM64_STAGE2_H
 
-//
-// Stage 2
-//
-
+/* arm64/implementation.h already included: #include "arm64/implementation.h" */
 /* begin file src/arm64/stringparsing.h */
 #ifndef SIMDJSON_ARM64_STRINGPARSING_H
 #define SIMDJSON_ARM64_STRINGPARSING_H
@@ -4126,10 +4049,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
   // If you consume a large value and you map it to "infinity", you will no
   // longer be able to serialize back a standard-compliant JSON. And there is
   // no realistic application where you might need values so large than they
-  // can't fit in binary64. The maximal value is about  1.7976931348623157 x
+  // can't fit in binary64. The maximal value is about  1.7976931348623157 ×
   // 10^308 It is an unimaginable large number. There will never be any piece of
   // engineering involving as many as 10^308 parts. It is estimated that there
-  // are about 10^80 atoms in the universe.  The estimate for the total number
+  // are about 10^80 atoms in the universe.  The estimate for the total number
   // of electrons is similar. Using a double-precision floating-point value, we
   // can represent easily the number of atoms in the universe. We could  also
   // represent the number of ways you can pick any three individual atoms at
@@ -4149,6 +4072,26 @@ really_inline bool is_integer(char c) {
   // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers
 }
 
+// We need to check that the character following a zero is valid. This is
+// probably frequent and it is harder than it looks. We are building all of this
+// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)...
+const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
+    1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+really_inline bool
+is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
+  return structural_or_whitespace_or_exponent_or_decimal_negated[c];
+}
 
 // check quickly whether the next 8 chars are made of digits
 // at a glance, it looks better than Mula's
@@ -4226,14 +4169,14 @@ never_inline bool parse_large_integer(const uint8_t *const src,
       // as a positive signed integer, but the negative version is
       // possible.
       constexpr int64_t signed_answer = INT64_MIN;
-      writer.append_s64(signed_answer);
+      writer.write_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
     } else {
       // we can negate safely
       int64_t signed_answer = -static_cast<int64_t>(i);
-      writer.append_s64(signed_answer);
+      writer.write_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
@@ -4246,12 +4189,12 @@ never_inline bool parse_large_integer(const uint8_t *const src,
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(i, src);
 #endif
-      writer.append_s64(i);
+      writer.write_s64(i);
     } else {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_unsigned_integer(i, src);
 #endif
-      writer.append_u64(i);
+      writer.write_u64(i);
     }
   }
   return is_structural_or_whitespace(*p);
@@ -4261,7 +4204,7 @@ template<typename W>
 bool slow_float_parsing(UNUSED const char * src, W writer) {
   double d;
   if (parse_float_strtod(src, &d)) {
-    writer.append_double(d);
+    writer.write_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_float(d, (const uint8_t *)src);
 #endif
@@ -4285,10 +4228,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) {
 template<typename W>
 really_inline bool parse_number(UNUSED const uint8_t *const src,
                                 UNUSED bool found_minus,
-                                W &writer) {
+                                W writer) {
 #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
                                   // useful to skip parsing
-  writer.append_s64(0);        // always write zero
+  writer.write_s64(0);        // always write zero
   return true;                    // always succeeds
 #else
   const char *p = reinterpret_cast<const char *>(src);
@@ -4308,7 +4251,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
   uint64_t i;      // an unsigned int avoids signed overflows (which are bad)
   if (*p == '0') { // 0 cannot be followed by an integer
     ++p;
-    if (is_integer(*p)) {
+    if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_invalid_number(src);
 #endif
@@ -4432,7 +4375,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       }
       // we over-decrement by one when there is a '.'
       digit_count -= int(start - start_digits);
-      if (unlikely(digit_count >= 19)) {
+      if (digit_count >= 19) {
         // Ok, chances are good that we had an overflow!
         // this is almost never going to get called!!!
         // we start anew, going slowly!!!
@@ -4440,22 +4383,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
         // 10000000000000000000000000000000000000000000e+308
         // 3.1415926535897932384626433832795028841971693993751
         //
-        bool success = slow_float_parsing((const char *) src, writer);
-        // The number was already written, but we made a copy of the writer
-        // when we passed it to the parse_large_integer() function, so 
-        writer.skip_double();
-        return success;
+        return slow_float_parsing((const char *) src, writer);
       }
     }
     if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) ||
         (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!!
       // this is almost never going to get called!!!
       // we start anew, going slowly!!!
-      bool success = slow_float_parsing((const char *) src, writer);
-      // The number was already written, but we made a copy of the writer when we passed it to the
-      // slow_float_parsing() function, so we have to skip those tape spots now that we've returned
-      writer.skip_double();
-      return success;
+      return slow_float_parsing((const char *) src, writer);
     }
     bool success = true;
     double d = compute_float_64(exponent, i, negative, &success);
@@ -4464,7 +4399,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       success = parse_float_strtod((const char *)src, &d);
     }
     if (success) {
-      writer.append_double(d);
+      writer.write_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_float(d, src);
 #endif
@@ -4479,14 +4414,10 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
     if (unlikely(digit_count >= 18)) { // this is uncommon!!!
       // there is a good chance that we had an overflow, so we need
       // need to recover: we parse the whole thing again.
-      bool success = parse_large_integer(src, writer, found_minus);
-      // The number was already written, but we made a copy of the writer
-      // when we passed it to the parse_large_integer() function, so 
-      writer.skip_large_integer();
-      return success;
+      return parse_large_integer(src, writer, found_minus);
     }
     i = negative ? 0 - i : i;
-    writer.append_s64(i);
+    writer.write_s64(i);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_integer(i, src);
 #endif
@@ -4508,72 +4439,6 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
 namespace simdjson {
 namespace arm64 {
 
-/* begin file src/generic/stage2/logger.h */
-// This is for an internal-only stage 2 specific logger.
-// Set LOG_ENABLED = true to log what stage 2 is doing!
-namespace logger {
-  static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
-
-  static constexpr const bool LOG_ENABLED = false;
-  static constexpr const int LOG_EVENT_LEN = 30;
-  static constexpr const int LOG_BUFFER_LEN = 20;
-  static constexpr const int LOG_DETAIL_LEN = 50;
-  static constexpr const int LOG_INDEX_LEN = 10;
-
-  static int log_depth; // Not threadsafe. Log only.
-
-  // Helper to turn unprintable or newline characters into spaces
-  static really_inline char printable_char(char c) {
-    if (c >= 0x20) {
-      return c;
-    } else {
-      return ' ';
-    }
-  }
-
-  // Print the header and set up log_start
-  static really_inline void log_start() {
-    if (LOG_ENABLED) {
-      log_depth = 0;
-      printf("\n");
-      printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index");
-      printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES);
-    }
-  }
-
-  static really_inline void log_string(const char *message) {
-    if (LOG_ENABLED) {
-      printf("%s\n", message);
-    }
-  }
-
-  // Logs a single line of 
-  template<typename S>
-  static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
-    if (LOG_ENABLED) {
-      printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
-      {
-        // Print the next N characters in the buffer.
-        printf("| ");
-        // Otherwise, print the characters starting from the buffer position.
-        // Print spaces for unprintable or newline characters.
-        for (int i=0;i<LOG_BUFFER_LEN;i++) {
-          printf("%c", printable_char(structurals.current()[i]));
-        }
-        printf(" ");
-      }
-      printf("|    %c ", printable_char(structurals.current_char()));
-      printf("|    %c ", printable_char(structurals.peek_next_char()));
-      printf("| %5u ", structurals.parser.structural_indexes[*(structurals.current_structural+1)]);
-      printf("| %5u ", structurals.next_tape_index());
-      printf("| %-*s ", LOG_DETAIL_LEN, detail);
-      printf("| %*u ", LOG_INDEX_LEN, *structurals.current_structural);
-      printf("|\n");
-    }
-  }
-} // namespace logger
-
-/* end file src/generic/stage2/logger.h */
 /* begin file src/generic/stage2/atomparsing.h */
 namespace stage2 {
 namespace atomparsing {
@@ -4632,34 +4497,26 @@ namespace stage2 {
 
 class structural_iterator {
 public:
-  const uint8_t* const buf;
-  uint32_t *current_structural;
-  dom_parser_implementation &parser;
-
-  // Start a structural 
-  really_inline structural_iterator(dom_parser_implementation &_parser, size_t start_structural_index)
-    : buf{_parser.buf},
-      current_structural{&_parser.structural_indexes[start_structural_index]},
-      parser{_parser} {
-  }
-  // Get the buffer position of the current structural character
-  really_inline const uint8_t* current() {
-    return &buf[*current_structural];
+  really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index)
+    : buf{_buf},
+     len{_len},
+     structural_indexes{_structural_indexes},
+     next_structural{next_structural_index}
+    {}
+  really_inline char advance_char() {
+    idx = structural_indexes[next_structural];
+    next_structural++;
+    c = *current();
+    return c;
   }
-  // Get the current structural character
   really_inline char current_char() {
-    return buf[*current_structural];
+    return c;
   }
-  // Get the next structural character without advancing
-  really_inline char peek_next_char() {
-    return buf[*(current_structural+1)];
-  }
-  really_inline char advance_char() {
-    current_structural++;
-    return buf[*current_structural];
+  really_inline const uint8_t* current() {
+    return &buf[idx];
   }
   really_inline size_t remaining_len() {
-    return parser.len - *current_structural;
+    return len - idx;
   }
   template<typename F>
   really_inline bool with_space_terminated_copy(const F& f) {
@@ -4676,25 +4533,32 @@ class structural_iterator {
     * practice unless you are in the strange scenario where you have many JSON
     * documents made of single atoms.
     */
-    char *copy = static_cast<char *>(malloc(parser.len + SIMDJSON_PADDING));
+    char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
     if (copy == nullptr) {
       return true;
     }
-    memcpy(copy, buf, parser.len);
-    memset(copy + parser.len, ' ', SIMDJSON_PADDING);
-    bool result = f(reinterpret_cast<const uint8_t*>(copy), *current_structural);
+    memcpy(copy, buf, len);
+    memset(copy + len, ' ', SIMDJSON_PADDING);
+    bool result = f(reinterpret_cast<const uint8_t*>(copy), idx);
     free(copy);
     return result;
   }
   really_inline bool past_end(uint32_t n_structural_indexes) {
-    return current_structural >= &parser.structural_indexes[n_structural_indexes];
+    return next_structural+1 > n_structural_indexes;
   }
   really_inline bool at_end(uint32_t n_structural_indexes) {
-    return current_structural == &parser.structural_indexes[n_structural_indexes];
+    return next_structural+1 == n_structural_indexes;
   }
-  really_inline bool at_beginning() {
-    return current_structural == parser.structural_indexes.get();
+  really_inline size_t next_structural_index() {
+    return next_structural;
   }
+
+  const uint8_t* const buf;
+  const size_t len;
+  const uint32_t* const structural_indexes;
+  size_t next_structural; // next structural index
+  size_t idx{0}; // location of the structural character in the input (buf)
+  uint8_t c{0};  // used to track the (structural) character we are looking at
 };
 
 } // namespace stage2
@@ -4706,105 +4570,8 @@ class structural_iterator {
 // "simdjson/stage2.h" (this simplifies amalgation)
 
 namespace stage2 {
-namespace { // Make everything here private
-
-/* begin file src/generic/stage2/tape_writer.h */
-struct tape_writer {
-  /** The next place to write to tape */
-  uint64_t *next_tape_loc;
-  
-  /** Write a signed 64-bit value to tape. */
-  really_inline void append_s64(int64_t value) noexcept;
-
-  /** Write an unsigned 64-bit value to tape. */
-  really_inline void append_u64(uint64_t value) noexcept;
-
-  /** Write a double value to tape. */
-  really_inline void append_double(double value) noexcept;
-
-  /**
-   * Append a tape entry (an 8-bit type,and 56 bits worth of value).
-   */
-  really_inline void append(uint64_t val, internal::tape_type t) noexcept;
-
-  /**
-   * Skip the current tape entry without writing.
-   *
-   * Used to skip the start of the container, since we'll come back later to fill it in when the
-   * container ends.
-   */
-  really_inline void skip() noexcept;
-
-  /**
-   * Skip the number of tape entries necessary to write a large u64 or i64.
-   */
-  really_inline void skip_large_integer() noexcept;
-
-  /**
-   * Skip the number of tape entries necessary to write a double.
-   */
-  really_inline void skip_double() noexcept;
-
-  /**
-   * Write a value to a known location on tape.
-   *
-   * Used to go back and write out the start of a container after the container ends.
-   */
-  really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
-
-private:
-  /**
-   * Append both the tape entry, and a supplementary value following it. Used for types that need
-   * all 64 bits, such as double and uint64_t.
-   */
-  template<typename T>
-  really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
-}; // struct number_writer
-
-really_inline void tape_writer::append_s64(int64_t value) noexcept {
-  append2(0, value, internal::tape_type::INT64);
-}
-
-really_inline void tape_writer::append_u64(uint64_t value) noexcept {
-  append(0, internal::tape_type::UINT64);
-  *next_tape_loc = value;
-  next_tape_loc++;
-}
-
-/** Write a double value to tape. */
-really_inline void tape_writer::append_double(double value) noexcept {
-  append2(0, value, internal::tape_type::DOUBLE);
-}
 
-really_inline void tape_writer::skip() noexcept {
-  next_tape_loc++;
-}
-
-really_inline void tape_writer::skip_large_integer() noexcept {
-  next_tape_loc += 2;
-}
-
-really_inline void tape_writer::skip_double() noexcept {
-  next_tape_loc += 2;
-}
-
-really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
-  *next_tape_loc = val | ((uint64_t(char(t))) << 56);
-  next_tape_loc++;
-}
-
-template<typename T>
-really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
-  append(val, t);
-  static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
-  memcpy(next_tape_loc, &val2, sizeof(val2));
-  next_tape_loc++;
-}
-
-really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
-  tape_loc = val | ((uint64_t(char(t))) << 56);
-}
-/* end file src/generic/stage2/tape_writer.h */
+using internal::ret_address;
 
 #ifdef SIMDJSON_USE_COMPUTED_GOTO
 #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue }
@@ -4835,88 +4602,102 @@ really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal
 #endif // SIMDJSON_USE_COMPUTED_GOTO
 
 struct unified_machine_addresses {
-  ret_address_t array_begin;
-  ret_address_t array_continue;
-  ret_address_t error;
-  ret_address_t finish;
-  ret_address_t object_begin;
-  ret_address_t object_continue;
+  ret_address array_begin;
+  ret_address array_continue;
+  ret_address error;
+  ret_address finish;
+  ret_address object_begin;
+  ret_address object_continue;
 };
 
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } }
 
-struct structural_parser : structural_iterator {
-  /** Lets you append to the tape */
-  tape_writer tape;
+struct number_writer {
+  parser &doc_parser;
+  
+  really_inline void write_s64(int64_t value) noexcept {
+    write_tape(0, internal::tape_type::INT64);
+    std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value));
+    ++doc_parser.current_loc;
+  }
+  really_inline void write_u64(uint64_t value) noexcept {
+    write_tape(0, internal::tape_type::UINT64);
+    doc_parser.doc.tape[doc_parser.current_loc++] = value;
+  }
+  really_inline void write_double(double value) noexcept {
+    write_tape(0, internal::tape_type::DOUBLE);
+    static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size");
+    memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double));
+    // doc.tape[doc.current_loc++] = *((uint64_t *)&d);
+  }
+  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
+    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
+  }
+}; // struct number_writer
+
+struct structural_parser {
+  structural_iterator structurals;
+  parser &doc_parser;
   /** Next write location in the string buf for stage 2 parsing */
-  uint8_t *current_string_buf_loc;
-  /** Current depth (nested objects and arrays) */
-  uint32_t depth{0};
-
-  // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations
-  really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index)
-    : structural_iterator(_parser, start_structural_index),
-      tape{parser.doc->tape.get()},
-      current_string_buf_loc{parser.doc->string_buf.get()} {
-  }
-
-  WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) {
-    parser.containing_scope[depth].tape_index = next_tape_index();
-    parser.containing_scope[depth].count = 0;
-    tape.skip(); // We don't actually *write* the start element until the end.
-    parser.ret_address[depth] = continue_state;
+  uint8_t *current_string_buf_loc{};
+  uint32_t depth;
+
+  really_inline structural_parser(
+    const uint8_t *buf,
+    size_t len,
+    parser &_doc_parser,
+    uint32_t next_structural = 0
+  ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {}
+
+  WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) {
+    doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc;
+    doc_parser.containing_scope[depth].count = 0;
+    write_tape(0, type); // if the document is correct, this gets rewritten later
+    doc_parser.ret_address[depth] = continue_state;
     depth++;
-    bool exceeded_max_depth = depth >= parser.max_depth();
-    if (exceeded_max_depth) { log_error("Exceeded max depth!"); }
-    return exceeded_max_depth;
+    return depth >= doc_parser.max_depth();
   }
 
-  WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) {
-    log_start_value("document");
-    return start_scope(continue_state);
+  WARN_UNUSED really_inline bool start_document(ret_address continue_state) {
+    return start_scope(internal::tape_type::ROOT, continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) {
-    log_start_value("object");
-    return start_scope(continue_state);
+  WARN_UNUSED really_inline bool start_object(ret_address continue_state) {
+    return start_scope(internal::tape_type::START_OBJECT, continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) {
-    log_start_value("array");
-    return start_scope(continue_state);
+  WARN_UNUSED really_inline bool start_array(ret_address continue_state) {
+    return start_scope(internal::tape_type::START_ARRAY, continue_state);
   }
 
   // this function is responsible for annotating the start of the scope
-  really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept {
+  really_inline void end_scope(internal::tape_type type) noexcept {
     depth--;
-    // write our doc->tape location to the header scope
+    // write our doc.tape location to the header scope
     // The root scope gets written *at* the previous location.
-    tape.append(parser.containing_scope[depth].tape_index, end);
+    write_tape(doc_parser.containing_scope[depth].tape_index, type);
     // count can overflow if it exceeds 24 bits... so we saturate
     // the convention being that a cnt of 0xffffff or more is undetermined in value (>=  0xffffff).
-    const uint32_t start_tape_index = parser.containing_scope[depth].tape_index;
-    const uint32_t count = parser.containing_scope[depth].count;
+    const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index;
+    const uint32_t count = doc_parser.containing_scope[depth].count;
     const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
-    // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index]
-    tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start);
-  }
-
-  really_inline uint32_t next_tape_index() {
-    return uint32_t(tape.next_tape_loc - parser.doc->tape.get());
+    // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index]
+    doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32);
   }
 
   really_inline void end_object() {
-    log_end_value("object");
-    end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
+    end_scope(internal::tape_type::END_OBJECT);
   }
   really_inline void end_array() {
-    log_end_value("array");
-    end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
+    end_scope(internal::tape_type::END_ARRAY);
   }
   really_inline void end_document() {
-    log_end_value("document");
-    end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT);
+    end_scope(internal::tape_type::ROOT);
+  }
+
+  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
+    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
   }
 
   // increment_count increments the count of keys in an object or values in an array.
@@ -4924,16 +4705,17 @@ struct structural_parser : structural_iterator {
   // must be increment in the preceding depth (depth-1) where the array or
   // the object resides.
   really_inline void increment_count() {
-    parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
+    doc_parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
   }
 
   really_inline uint8_t *on_start_string() noexcept {
-    // we advance the point, accounting for the fact that we have a NULL termination
-    tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING);
+    /* we advance the point, accounting for the fact that we have a NULL
+      * termination         */
+    write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING);
     return current_string_buf_loc + sizeof(uint32_t);
   }
 
-  really_inline void on_end_string(uint8_t *dst) noexcept {
+  really_inline bool on_end_string(uint8_t *dst) noexcept {
     uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
     // TODO check for overflow in case someone has a crazy string (>=4GB?)
     // But only add the overflow check when the document itself exceeds 4GB
@@ -4943,49 +4725,73 @@ struct structural_parser : structural_iterator {
     // be NULL terminated? It comes at a small cost
     *dst = 0;
     current_string_buf_loc = dst + 1;
+    return true;
   }
 
-  WARN_UNUSED really_inline bool parse_string(bool key = false) {
-    log_value(key ? "key" : "string");
+  WARN_UNUSED really_inline bool parse_string() {
     uint8_t *dst = on_start_string();
-    dst = stringparsing::parse_string(current(), dst);
+    dst = stringparsing::parse_string(structurals.current(), dst);
     if (dst == nullptr) {
-      log_error("Invalid escape in string");
       return true;
     }
-    on_end_string(dst);
-    return false;
+    return !on_end_string(dst);
   }
 
   WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) {
-    log_value("number");
-    bool succeeded = numberparsing::parse_number(src, found_minus, tape);
-    if (!succeeded) { log_error("Invalid number"); }
-    return !succeeded;
+    number_writer writer{doc_parser};
+    return !numberparsing::parse_number(src, found_minus, writer);
   }
   WARN_UNUSED really_inline bool parse_number(bool found_minus) {
-    return parse_number(current(), found_minus);
+    return parse_number(structurals.current(), found_minus);
+  }
+
+  WARN_UNUSED really_inline bool parse_atom() {
+    switch (structurals.current_char()) {
+      case 't':
+        if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; }
+        write_tape(0, internal::tape_type::TRUE_VALUE);
+        break;
+      case 'f':
+        if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; }
+        write_tape(0, internal::tape_type::FALSE_VALUE);
+        break;
+      case 'n':
+        if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; }
+        write_tape(0, internal::tape_type::NULL_VALUE);
+        break;
+      default:
+        return true;
+    }
+    return false;
   }
 
-  WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) {
-    switch (advance_char()) {
+  WARN_UNUSED really_inline bool parse_single_atom() {
+    switch (structurals.current_char()) {
+      case 't':
+        if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; }
+        write_tape(0, internal::tape_type::TRUE_VALUE);
+        break;
+      case 'f':
+        if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; }
+        write_tape(0, internal::tape_type::FALSE_VALUE);
+        break;
+      case 'n':
+        if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; }
+        write_tape(0, internal::tape_type::NULL_VALUE);
+        break;
+      default:
+        return true;
+    }
+    return false;
+  }
+
+  WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) {
+    switch (structurals.current_char()) {
     case '"':
       FAIL_IF( parse_string() );
       return continue_state;
-    case 't':
-      log_value("true");
-      FAIL_IF( !atomparsing::is_valid_true_atom(current()) );
-      tape.append(0, internal::tape_type::TRUE_VALUE);
-      return continue_state;
-    case 'f':
-      log_value("false");
-      FAIL_IF( !atomparsing::is_valid_false_atom(current()) );
-      tape.append(0, internal::tape_type::FALSE_VALUE);
-      return continue_state;
-    case 'n':
-      log_value("null");
-      FAIL_IF( !atomparsing::is_valid_null_atom(current()) );
-      tape.append(0, internal::tape_type::NULL_VALUE);
+    case 't': case 'f': case 'n':
+      FAIL_IF( parse_atom() );
       return continue_state;
     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
@@ -5001,27 +4807,40 @@ struct structural_parser : structural_iterator {
       FAIL_IF( start_array(continue_state) );
       return addresses.array_begin;
     default:
-      log_error("Non-value found when value was expected!");
       return addresses.error;
     }
   }
 
   WARN_UNUSED really_inline error_code finish() {
+    // the string might not be NULL terminated.
+    if ( !structurals.at_end(doc_parser.n_structural_indexes) ) {
+      return on_error(TAPE_ERROR);
+    }
     end_document();
-    parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]);
-
     if (depth != 0) {
-      log_error("Unclosed objects or arrays!");
-      return parser.error = TAPE_ERROR;
+      return on_error(TAPE_ERROR);
+    }
+    if (doc_parser.containing_scope[depth].tape_index != 0) {
+      return on_error(TAPE_ERROR);
     }
 
-    return SUCCESS;
+    return on_success(SUCCESS);
+  }
+
+  really_inline error_code on_error(error_code new_error_code) noexcept {
+    doc_parser.error = new_error_code;
+    return new_error_code;
+  }
+  really_inline error_code on_success(error_code success_code) noexcept {
+    doc_parser.error = success_code;
+    doc_parser.valid = true;
+    return success_code;
   }
 
   WARN_UNUSED really_inline error_code error() {
-    /* We do not need the next line because this is done by parser.init_stage2(),
+    /* We do not need the next line because this is done by doc_parser.init_stage2(),
     * pessimistically.
-    * parser.is_valid  = false;
+    * doc_parser.is_valid  = false;
     * At this point in the code, we have all the time in the world.
     * Note that we know exactly where we are in the document so we could,
     * without any overhead on the processing code, report a specific
@@ -5029,12 +4848,12 @@ struct structural_parser : structural_iterator {
     * We could even trigger special code paths to assess what happened
     * carefully,
     * all without any added cost. */
-    if (depth >= parser.max_depth()) {
-      return parser.error = DEPTH_ERROR;
+    if (depth >= doc_parser.max_depth()) {
+      return on_error(DEPTH_ERROR);
     }
-    switch (current_char()) {
+    switch (structurals.current_char()) {
     case '"':
-      return parser.error = STRING_ERROR;
+      return on_error(STRING_ERROR);
     case '0':
     case '1':
     case '2':
@@ -5046,124 +4865,92 @@ struct structural_parser : structural_iterator {
     case '8':
     case '9':
     case '-':
-      return parser.error = NUMBER_ERROR;
+      return on_error(NUMBER_ERROR);
     case 't':
-      return parser.error = T_ATOM_ERROR;
+      return on_error(T_ATOM_ERROR);
     case 'n':
-      return parser.error = N_ATOM_ERROR;
+      return on_error(N_ATOM_ERROR);
     case 'f':
-      return parser.error = F_ATOM_ERROR;
+      return on_error(F_ATOM_ERROR);
     default:
-      return parser.error = TAPE_ERROR;
+      return on_error(TAPE_ERROR);
     }
   }
 
   really_inline void init() {
-    log_start();
-    parser.error = UNINITIALIZED;
+    current_string_buf_loc = doc_parser.doc.string_buf.get();
+    doc_parser.current_loc = 0;
+    doc_parser.valid = false;
+    doc_parser.error = UNINITIALIZED;
   }
 
-  WARN_UNUSED really_inline error_code start(ret_address_t finish_state) {
-    // If there are no structurals left, return EMPTY
-    if (at_end(parser.n_structural_indexes)) {
-      return parser.error = EMPTY;
+  WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) {
+    init(); // sets is_valid to false
+    if (len > doc_parser.capacity()) {
+      return CAPACITY;
     }
-
-    init();
+    // Advance to the first character as soon as possible
+    structurals.advance_char();
     // Push the root scope (there is always at least one scope)
     if (start_document(finish_state)) {
-      return parser.error = DEPTH_ERROR;
+      return on_error(DEPTH_ERROR);
     }
     return SUCCESS;
   }
 
-  really_inline void log_value(const char *type) {
-    logger::log_line(*this, "", type, "");
-  }
-
-  static really_inline void log_start() {
-    logger::log_start();
-  }
-
-  really_inline void log_start_value(const char *type) {
-    logger::log_line(*this, "+", type, "");
-    if (logger::LOG_ENABLED) { logger::log_depth++; }
-  }
-
-  really_inline void log_end_value(const char *type) {
-    if (logger::LOG_ENABLED) { logger::log_depth--; }
-    logger::log_line(*this, "-", type, "");
-  }
-
-  really_inline void log_error(const char *error) {
-    logger::log_line(*this, "", "ERROR", error);
+  really_inline char advance_char() {
+    return structurals.advance_char();
   }
-}; // struct structural_parser
+};
 
 // Redefine FAIL_IF to use goto since it'll be used inside the function now
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { goto error; } }
 
-template<bool STREAMING>
-WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
-  dom_parser.doc = &doc;
+} // namespace stage2
+
+/************
+ * The JSON is parsed to a tape, see the accompanying tape.md file
+ * for documentation.
+ ***********/
+WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
   static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
-  stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
-  error_code result = parser.start(addresses.finish);
+  stage2::structural_parser parser(buf, len, doc_parser);
+  error_code result = parser.start(len, addresses.finish);
   if (result) { return result; }
 
   //
   // Read first value
   //
-  switch (parser.current_char()) {
+  switch (parser.structurals.current_char()) {
   case '{':
     FAIL_IF( parser.start_object(addresses.finish) );
     goto object_begin;
   case '[':
     FAIL_IF( parser.start_array(addresses.finish) );
-    // Make sure the outer array is closed before continuing; otherwise, there are ways we could get
-    // into memory corruption. See https://github.com/simdjson/simdjson/issues/906
-    if (!STREAMING) {
-      if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') {
-        goto error;
-      }
-    }
     goto array_begin;
   case '"':
     FAIL_IF( parser.parse_string() );
     goto finish;
-  case 't':
-    parser.log_value("true");
-    FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) );
-    parser.tape.append(0, internal::tape_type::TRUE_VALUE);
-    goto finish;
-  case 'f':
-    parser.log_value("false");
-    FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) );
-    parser.tape.append(0, internal::tape_type::FALSE_VALUE);
-    goto finish;
-  case 'n':
-    parser.log_value("null");
-    FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) );
-    parser.tape.append(0, internal::tape_type::NULL_VALUE);
+  case 't': case 'f': case 'n':
+    FAIL_IF( parser.parse_single_atom() );
     goto finish;
   case '0': case '1': case '2': case '3': case '4':
   case '5': case '6': case '7': case '8': case '9':
     FAIL_IF(
-      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], false);
       })
     );
     goto finish;
   case '-':
     FAIL_IF(
-      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], true);
       })
     );
     goto finish;
   default:
-    parser.log_error("Document starts with a non-value character");
     goto error;
   }
 
@@ -5174,45 +4961,43 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
   switch (parser.advance_char()) {
   case '"': {
     parser.increment_count();
-    FAIL_IF( parser.parse_string(true) );
+    FAIL_IF( parser.parse_string() );
     goto object_key_state;
   }
   case '}':
     parser.end_object();
     goto scope_end;
   default:
-    parser.log_error("Object does not start with a key");
     goto error;
   }
 
 object_key_state:
-  if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; }
+  FAIL_IF( parser.advance_char() != ':' );
+  parser.advance_char();
   GOTO( parser.parse_value(addresses, addresses.object_continue) );
 
 object_continue:
   switch (parser.advance_char()) {
   case ',':
     parser.increment_count();
-    if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; }
-    FAIL_IF( parser.parse_string(true) );
+    FAIL_IF( parser.advance_char() != '"' );
+    FAIL_IF( parser.parse_string() );
     goto object_key_state;
   case '}':
     parser.end_object();
     goto scope_end;
   default:
-    parser.log_error("No comma between object fields");
     goto error;
   }
 
 scope_end:
-  CONTINUE( parser.parser.ret_address[parser.depth] );
+  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
 
 //
 // Array parser states
 //
 array_begin:
-  if (parser.peek_next_char() == ']') {
-    parser.advance_char();
+  if (parser.advance_char() == ']') {
     parser.end_array();
     goto scope_end;
   }
@@ -5227,12 +5012,12 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
   switch (parser.advance_char()) {
   case ',':
     parser.increment_count();
+    parser.advance_char();
     goto main_array_switch;
   case ']':
     parser.end_array();
     goto scope_end;
   default:
-    parser.log_error("Missing comma between array values");
     goto error;
   }
 
@@ -5243,298 +5028,194 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
   return parser.error();
 }
 
-} // namespace {}
+WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
+  error_code code = stage1(buf, len, doc_parser, false);
+  if (!code) {
+    code = stage2(buf, len, doc_parser);
+  }
+  return code;
+}
+/* end file src/generic/stage2/structural_parser.h */
+/* begin file src/generic/stage2/streaming_structural_parser.h */
+namespace stage2 {
+
+struct streaming_structural_parser: structural_parser {
+  really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {}
+
+  // override to add streaming
+  WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) {
+    init(); // sets is_valid to false
+    // Capacity ain't no thang for streaming, so we don't check it.
+    // Advance to the first character as soon as possible
+    advance_char();
+    // Push the root scope (there is always at least one scope)
+    if (start_document(finish_parser)) {
+      return on_error(DEPTH_ERROR);
+    }
+    return SUCCESS;
+  }
+
+  // override to add streaming
+  WARN_UNUSED really_inline error_code finish() {
+    if ( structurals.past_end(doc_parser.n_structural_indexes) ) {
+      return on_error(TAPE_ERROR);
+    }
+    end_document();
+    if (depth != 0) {
+      return on_error(TAPE_ERROR);
+    }
+    if (doc_parser.containing_scope[depth].tape_index != 0) {
+      return on_error(TAPE_ERROR);
+    }
+    bool finished = structurals.at_end(doc_parser.n_structural_indexes);
+    return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
+  }
+};
+
 } // namespace stage2
 
 /************
  * The JSON is parsed to a tape, see the accompanying tape.md file
  * for documentation.
  ***********/
-WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
-  error_code result = stage2::parse_structurals<false>(*this, _doc);
+WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept {
+  static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
+  stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json));
+  error_code result = parser.start(len, addresses.finish);
   if (result) { return result; }
+  //
+  // Read first value
+  //
+  switch (parser.structurals.current_char()) {
+  case '{':
+    FAIL_IF( parser.start_object(addresses.finish) );
+    goto object_begin;
+  case '[':
+    FAIL_IF( parser.start_array(addresses.finish) );
+    goto array_begin;
+  case '"':
+    FAIL_IF( parser.parse_string() );
+    goto finish;
+  case 't': case 'f': case 'n':
+    FAIL_IF( parser.parse_single_atom() );
+    goto finish;
+  case '0': case '1': case '2': case '3': case '4':
+  case '5': case '6': case '7': case '8': case '9':
+    FAIL_IF(
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+        return parser.parse_number(&copy[idx], false);
+      })
+    );
+    goto finish;
+  case '-':
+    FAIL_IF(
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+        return parser.parse_number(&copy[idx], true);
+      })
+    );
+    goto finish;
+  default:
+    goto error;
+  }
 
-  // If we didn't make it to the end, it's an error
-  if ( next_structural_index != n_structural_indexes ) {
-    logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
-    return error = TAPE_ERROR;
+//
+// Object parser states
+//
+object_begin:
+  switch (parser.advance_char()) {
+  case '"': {
+    FAIL_IF( parser.parse_string() );
+    goto object_key_parser;
+  }
+  case '}':
+    parser.end_object();
+    goto scope_end;
+  default:
+    goto error;
   }
 
-  return SUCCESS;
-}
+object_key_parser:
+  FAIL_IF( parser.advance_char() != ':' );
+  parser.increment_count();
+  parser.advance_char();
+  GOTO( parser.parse_value(addresses, addresses.object_continue) );
 
-/************
- * The JSON is parsed to a tape, see the accompanying tape.md file
- * for documentation.
- ***********/
-WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
-  return stage2::parse_structurals<true>(*this, _doc);
-}
-/* end file src/generic/stage2/tape_writer.h */
+object_continue:
+  switch (parser.advance_char()) {
+  case ',':
+    FAIL_IF( parser.advance_char() != '"' );
+    FAIL_IF( parser.parse_string() );
+    goto object_key_parser;
+  case '}':
+    parser.end_object();
+    goto scope_end;
+  default:
+    goto error;
+  }
+
+scope_end:
+  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
+
+//
+// Array parser states
+//
+array_begin:
+  if (parser.advance_char() == ']') {
+    parser.end_array();
+    goto scope_end;
+  }
+  parser.increment_count();
+
+main_array_switch:
+  /* we call update char on all paths in, so we can peek at parser.c on the
+   * paths that can accept a close square brace (post-, and at start) */
+  GOTO( parser.parse_value(addresses, addresses.array_continue) );
+
+array_continue:
+  switch (parser.advance_char()) {
+  case ',':
+    parser.increment_count();
+    parser.advance_char();
+    goto main_array_switch;
+  case ']':
+    parser.end_array();
+    goto scope_end;
+  default:
+    goto error;
+  }
+
+finish:
+  next_json = parser.structurals.next_structural_index();
+  return parser.finish();
 
-WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
-  error_code err = stage1(_buf, _len, false);
-  if (err) { return err; }
-  return stage2(_doc);
+error:
+  return parser.error();
 }
+/* end file src/generic/stage2/streaming_structural_parser.h */
 
 } // namespace arm64
 } // namespace simdjson
-/* end file src/generic/stage2/tape_writer.h */
+
+#endif // SIMDJSON_ARM64_STAGE2_H
+/* end file src/generic/stage2/streaming_structural_parser.h */
 #endif
 #if SIMDJSON_IMPLEMENTATION_FALLBACK
-/* begin file src/fallback/implementation.cpp */
+/* begin file src/fallback/stage1.cpp */
 /* fallback/implementation.h already included: #include "fallback/implementation.h" */
-/* begin file src/fallback/dom_parser_implementation.h */
-#ifndef SIMDJSON_FALLBACK_DOM_PARSER_IMPLEMENTATION_H
-#define SIMDJSON_FALLBACK_DOM_PARSER_IMPLEMENTATION_H
-
-/* isadetection.h already included: #include "isadetection.h" */
 
 namespace simdjson {
 namespace fallback {
+namespace stage1 {
 
-/* begin file src/generic/dom_parser_implementation.h */
-// expectation: sizeof(scope_descriptor) = 64/8.
-struct scope_descriptor {
-  uint32_t tape_index; // where, on the tape, does the scope ([,{) begins
-  uint32_t count; // how many elements in the scope
-}; // struct scope_descriptor
-
-#ifdef SIMDJSON_USE_COMPUTED_GOTO
-typedef void* ret_address_t;
-#else
-typedef char ret_address_t;
-#endif
-
-class dom_parser_implementation final : public internal::dom_parser_implementation {
+class structural_scanner {
 public:
-  /** Tape location of each open { or [ */
-  std::unique_ptr<scope_descriptor[]> containing_scope{};
-  /** Return address of each open { or [ */
-  std::unique_ptr<ret_address_t[]> ret_address{};
-  /** Buffer passed to stage 1 */
-  const uint8_t *buf{};
-  /** Length passed to stage 1 */
-  size_t len{0};
-  /** Document passed to stage 2 */
-  dom::document *doc{};
-  /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */
-  error_code error{UNINITIALIZED};
-
-  really_inline dom_parser_implementation();
-  dom_parser_implementation(const dom_parser_implementation &) = delete;
-  dom_parser_implementation & operator=(const dom_parser_implementation &) = delete;
-
-  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
-  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final;
-  WARN_UNUSED error_code check_for_unclosed_array() noexcept;
-  WARN_UNUSED error_code stage2(dom::document &doc) noexcept final;
-  WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final;
-  WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final;
-  WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final;
-};
 
-/* begin file src/generic/stage1/allocate.h */
-namespace stage1 {
-namespace allocate {
+really_inline structural_scanner(const uint8_t *_buf, uint32_t _len, parser &_doc_parser, bool _streaming)
+  : buf{_buf}, next_structural_index{_doc_parser.structural_indexes.get()}, doc_parser{_doc_parser}, idx{0}, len{_len}, error{SUCCESS}, streaming{_streaming} {}
 
-//
-// Allocates stage 1 internal state and outputs in the parser
-//
-really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) {
-  size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7;
-  parser.structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] );
-  if (!parser.structural_indexes) { return MEMALLOC; }
-  parser.structural_indexes[0] = 0;
-  parser.n_structural_indexes = 0;
-  return SUCCESS;
-}
-
-} // namespace allocate
-} // namespace stage1
-/* end file src/generic/stage1/allocate.h */
-/* begin file src/generic/stage2/allocate.h */
-namespace stage2 {
-namespace allocate {
-
-//
-// Allocates stage 2 internal state and outputs in the parser
-//
-really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) {
-  parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]);
-  parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]);
-
-  if (!parser.ret_address || !parser.containing_scope) {
-    return MEMALLOC;
-  }
-  return SUCCESS;
-}
-
-} // namespace allocate
-} // namespace stage2
-/* end file src/generic/stage2/allocate.h */
-
-really_inline dom_parser_implementation::dom_parser_implementation() {}
-
-// Leaving these here so they can be inlined if so desired
-WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
-  error_code err = stage1::allocate::set_capacity(*this, capacity);
-  if (err) { _capacity = 0; return err; }
-  _capacity = capacity;
-  return SUCCESS;
-}
-
-WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept {
-  error_code err = stage2::allocate::set_max_depth(*this, max_depth);
-  if (err) { _max_depth = 0; return err; }
-  _max_depth = max_depth;
-  return SUCCESS;
-}
-/* end file src/generic/stage2/allocate.h */
-
-} // namespace fallback
-} // namespace simdjson
-
-#endif // SIMDJSON_FALLBACK_DOM_PARSER_IMPLEMENTATION_H
-/* end file src/generic/stage2/allocate.h */
-
-TARGET_HASWELL
-
-namespace simdjson {
-namespace fallback {
-
-WARN_UNUSED error_code implementation::create_dom_parser_implementation(
-  size_t capacity,
-  size_t max_depth,
-  std::unique_ptr<internal::dom_parser_implementation>& dst
-) const noexcept {
-  dst.reset( new (std::nothrow) dom_parser_implementation() );
-  if (!dst) { return MEMALLOC; }
-  dst->set_capacity(capacity);
-  dst->set_max_depth(max_depth);
-  return SUCCESS;
-}
-
-} // namespace fallback
-} // namespace simdjson
-
-UNTARGET_REGION
-/* end file src/generic/stage2/allocate.h */
-/* begin file src/fallback/dom_parser_implementation.cpp */
-/* fallback/implementation.h already included: #include "fallback/implementation.h" */
-/* fallback/dom_parser_implementation.h already included: #include "fallback/dom_parser_implementation.h" */
-
-//
-// Stage 1
-//
-namespace simdjson {
-namespace fallback {
-namespace stage1 {
-
-/* begin file src/generic/stage1/find_next_document_index.h */
-/**
-  * This algorithm is used to quickly identify the last structural position that
-  * makes up a complete document.
-  *
-  * It does this by going backwards and finding the last *document boundary* (a
-  * place where one value follows another without a comma between them). If the
-  * last document (the characters after the boundary) has an equal number of
-  * start and end brackets, it is considered complete.
-  *
-  * Simply put, we iterate over the structural characters, starting from
-  * the end. We consider that we found the end of a JSON document when the
-  * first element of the pair is NOT one of these characters: '{' '[' ';' ','
-  * and when the second element is NOT one of these characters: '}' '}' ';' ','.
-  *
-  * This simple comparison works most of the time, but it does not cover cases
-  * where the batch's structural indexes contain a perfect amount of documents.
-  * In such a case, we do not have access to the structural index which follows
-  * the last document, therefore, we do not have access to the second element in
-  * the pair, and that means we cannot identify the last document. To fix this
-  * issue, we keep a count of the open and closed curly/square braces we found
-  * while searching for the pair. When we find a pair AND the count of open and
-  * closed curly/square braces is the same, we know that we just passed a
-  * complete document, therefore the last json buffer location is the end of the
-  * batch.
-  */
-really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) {
-  // TODO don't count separately, just figure out depth
-  auto arr_cnt = 0;
-  auto obj_cnt = 0;
-  for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
-    auto idxb = parser.structural_indexes[i];
-    switch (parser.buf[idxb]) {
-    case ':':
-    case ',':
-      continue;
-    case '}':
-      obj_cnt--;
-      continue;
-    case ']':
-      arr_cnt--;
-      continue;
-    case '{':
-      obj_cnt++;
-      break;
-    case '[':
-      arr_cnt++;
-      break;
-    }
-    auto idxa = parser.structural_indexes[i - 1];
-    switch (parser.buf[idxa]) {
-    case '{':
-    case '[':
-    case ':':
-    case ',':
-      continue;
-    }
-    // Last document is complete, so the next document will appear after!
-    if (!arr_cnt && !obj_cnt) {
-      return parser.n_structural_indexes;
-    }
-    // Last document is incomplete; mark the document at i + 1 as the next one
-    return i;
-  }
-  return 0;
-}
-
-// Skip the last character if it is partial
-really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
-  if (unlikely(len < 3)) {
-    switch (len) {
-      case 2:
-        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
-        if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
-        return len;
-      case 1:
-        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
-        return len;
-      case 0:
-        return len;
-    }
-  }
-  if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
-  if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left
-  if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
-  return len;
-}
-/* end file src/generic/stage1/find_next_document_index.h */
-
-class structural_scanner {
-public:
-
-really_inline structural_scanner(dom_parser_implementation &_parser, bool _partial)
-  : buf{_parser.buf},
-    next_structural_index{_parser.structural_indexes.get()},
-    parser{_parser},
-    len{static_cast<uint32_t>(_parser.len)},
-    partial{_partial} {
-}
-
-really_inline void add_structural() {
-  *next_structural_index = idx;
-  next_structural_index++;
+really_inline void add_structural() {
+  *next_structural_index = idx;
+  next_structural_index++;
 }
 
 really_inline bool is_continuation(uint8_t c) {
@@ -5553,12 +5234,7 @@ really_inline void validate_utf8_character() {
   // 2-byte
   if ((buf[idx] & 0b00100000) == 0) {
     // missing continuation
-    if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
-      if (idx+1 > len && partial) { idx = len; return; }
-      error = UTF8_ERROR;
-      idx++;
-      return;
-    }
+    if (unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) { error = UTF8_ERROR; idx++; return; }
     // overlong: 1100000_ 10______
     if (buf[idx] <= 0b11000001) { error = UTF8_ERROR; }
     idx += 2;
@@ -5568,12 +5244,7 @@ really_inline void validate_utf8_character() {
   // 3-byte
   if ((buf[idx] & 0b00010000) == 0) {
     // missing continuation
-    if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
-      if (idx+2 > len && partial) { idx = len; return; }
-      error = UTF8_ERROR;
-      idx++;
-      return;
-    }
+    if (unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) { error = UTF8_ERROR; idx++; return; }
     // overlong: 11100000 100_____ ________
     if (buf[idx] == 0b11100000 && buf[idx+1] <= 0b10011111) { error = UTF8_ERROR; }
     // surrogates: U+D800-U+DFFF 11101101 101_____
@@ -5584,12 +5255,7 @@ really_inline void validate_utf8_character() {
 
   // 4-byte
   // missing continuation
-  if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
-    if (idx+2 > len && partial) { idx = len; return; }
-    error = UTF8_ERROR;
-    idx++;
-    return;
-  }
+  if (unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) { error = UTF8_ERROR; idx++; return; }
   // overlong: 11110000 1000____ ________ ________
   if (buf[idx] == 0b11110000 && buf[idx+1] <= 0b10001111) { error = UTF8_ERROR; }
   // too large: > U+10FFFF:
@@ -5614,7 +5280,7 @@ really_inline void validate_string() {
       idx++;
     }
   }
-  if (idx >= len && !partial) { error = UNCLOSED_STRING; }
+  if (idx >= len && !streaming) { error = UNCLOSED_STRING; }
 }
 
 really_inline bool is_whitespace_or_operator(uint8_t c) {
@@ -5655,46 +5321,33 @@ really_inline error_code scan() {
         break;
     }
   }
-  *next_structural_index = len;
-  // We pad beyond.
-  // https://github.com/simdjson/simdjson/issues/906
-  next_structural_index[1] = len;
-  next_structural_index[2] = 0;
-  parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
-  parser.next_structural_index = 0;
-
-  if (unlikely(parser.n_structural_indexes == 0)) {
+  if (unlikely(next_structural_index == doc_parser.structural_indexes.get())) {
     return EMPTY;
   }
-
-  if (partial) {
-    auto new_structural_indexes = find_next_document_index(parser);
-    if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
-      return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
-    }
-    parser.n_structural_indexes = new_structural_indexes;
-  }
-
+  *next_structural_index = len;
+  next_structural_index++;
+  doc_parser.n_structural_indexes = uint32_t(next_structural_index - doc_parser.structural_indexes.get());
   return error;
 }
 
 private:
   const uint8_t *buf;
   uint32_t *next_structural_index;
-  dom_parser_implementation &parser;
+  parser &doc_parser;
+  uint32_t idx;
   uint32_t len;
-  uint32_t idx{0};
-  error_code error{SUCCESS};
-  bool partial;
+  error_code error;
+  bool streaming;
 }; // structural_scanner
 
 } // namespace stage1
 
 
-WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool partial) noexcept {
-  this->buf = _buf;
-  this->len = _len;
-  stage1::structural_scanner scanner(*this, partial);
+WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept {
+  if (unlikely(len > parser.capacity())) {
+    return CAPACITY;
+  }
+  stage1::structural_scanner scanner(buf, uint32_t(len), parser, streaming);
   return scanner.scan();
 }
 
@@ -5756,10 +5409,10 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
 
 } // namespace fallback
 } // namespace simdjson
+/* end file src/fallback/stage1.cpp */
+/* begin file src/fallback/stage2.cpp */
 
-//
-// Stage 2
-//
+/* fallback/implementation.h already included: #include "fallback/implementation.h" */
 /* begin file src/fallback/stringparsing.h */
 #ifndef SIMDJSON_FALLBACK_STRINGPARSING_H
 #define SIMDJSON_FALLBACK_STRINGPARSING_H
@@ -6219,10 +5872,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
   // If you consume a large value and you map it to "infinity", you will no
   // longer be able to serialize back a standard-compliant JSON. And there is
   // no realistic application where you might need values so large than they
-  // can't fit in binary64. The maximal value is about  1.7976931348623157 x
+  // can't fit in binary64. The maximal value is about  1.7976931348623157 ×
   // 10^308 It is an unimaginable large number. There will never be any piece of
   // engineering involving as many as 10^308 parts. It is estimated that there
-  // are about 10^80 atoms in the universe.  The estimate for the total number
+  // are about 10^80 atoms in the universe.  The estimate for the total number
   // of electrons is similar. Using a double-precision floating-point value, we
   // can represent easily the number of atoms in the universe. We could  also
   // represent the number of ways you can pick any three individual atoms at
@@ -6242,6 +5895,26 @@ really_inline bool is_integer(char c) {
   // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers
 }
 
+// We need to check that the character following a zero is valid. This is
+// probably frequent and it is harder than it looks. We are building all of this
+// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)...
+const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
+    1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+really_inline bool
+is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
+  return structural_or_whitespace_or_exponent_or_decimal_negated[c];
+}
 
 // check quickly whether the next 8 chars are made of digits
 // at a glance, it looks better than Mula's
@@ -6319,14 +5992,14 @@ never_inline bool parse_large_integer(const uint8_t *const src,
       // as a positive signed integer, but the negative version is
       // possible.
       constexpr int64_t signed_answer = INT64_MIN;
-      writer.append_s64(signed_answer);
+      writer.write_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
     } else {
       // we can negate safely
       int64_t signed_answer = -static_cast<int64_t>(i);
-      writer.append_s64(signed_answer);
+      writer.write_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
@@ -6339,12 +6012,12 @@ never_inline bool parse_large_integer(const uint8_t *const src,
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(i, src);
 #endif
-      writer.append_s64(i);
+      writer.write_s64(i);
     } else {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_unsigned_integer(i, src);
 #endif
-      writer.append_u64(i);
+      writer.write_u64(i);
     }
   }
   return is_structural_or_whitespace(*p);
@@ -6354,7 +6027,7 @@ template<typename W>
 bool slow_float_parsing(UNUSED const char * src, W writer) {
   double d;
   if (parse_float_strtod(src, &d)) {
-    writer.append_double(d);
+    writer.write_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_float(d, (const uint8_t *)src);
 #endif
@@ -6378,10 +6051,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) {
 template<typename W>
 really_inline bool parse_number(UNUSED const uint8_t *const src,
                                 UNUSED bool found_minus,
-                                W &writer) {
+                                W writer) {
 #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
                                   // useful to skip parsing
-  writer.append_s64(0);        // always write zero
+  writer.write_s64(0);        // always write zero
   return true;                    // always succeeds
 #else
   const char *p = reinterpret_cast<const char *>(src);
@@ -6401,7 +6074,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
   uint64_t i;      // an unsigned int avoids signed overflows (which are bad)
   if (*p == '0') { // 0 cannot be followed by an integer
     ++p;
-    if (is_integer(*p)) {
+    if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_invalid_number(src);
 #endif
@@ -6525,7 +6198,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       }
       // we over-decrement by one when there is a '.'
       digit_count -= int(start - start_digits);
-      if (unlikely(digit_count >= 19)) {
+      if (digit_count >= 19) {
         // Ok, chances are good that we had an overflow!
         // this is almost never going to get called!!!
         // we start anew, going slowly!!!
@@ -6533,22 +6206,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
         // 10000000000000000000000000000000000000000000e+308
         // 3.1415926535897932384626433832795028841971693993751
         //
-        bool success = slow_float_parsing((const char *) src, writer);
-        // The number was already written, but we made a copy of the writer
-        // when we passed it to the parse_large_integer() function, so 
-        writer.skip_double();
-        return success;
+        return slow_float_parsing((const char *) src, writer);
       }
     }
     if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) ||
         (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!!
       // this is almost never going to get called!!!
       // we start anew, going slowly!!!
-      bool success = slow_float_parsing((const char *) src, writer);
-      // The number was already written, but we made a copy of the writer when we passed it to the
-      // slow_float_parsing() function, so we have to skip those tape spots now that we've returned
-      writer.skip_double();
-      return success;
+      return slow_float_parsing((const char *) src, writer);
     }
     bool success = true;
     double d = compute_float_64(exponent, i, negative, &success);
@@ -6557,7 +6222,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       success = parse_float_strtod((const char *)src, &d);
     }
     if (success) {
-      writer.append_double(d);
+      writer.write_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_float(d, src);
 #endif
@@ -6572,14 +6237,10 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
     if (unlikely(digit_count >= 18)) { // this is uncommon!!!
       // there is a good chance that we had an overflow, so we need
       // need to recover: we parse the whole thing again.
-      bool success = parse_large_integer(src, writer, found_minus);
-      // The number was already written, but we made a copy of the writer
-      // when we passed it to the parse_large_integer() function, so 
-      writer.skip_large_integer();
-      return success;
+      return parse_large_integer(src, writer, found_minus);
     }
     i = negative ? 0 - i : i;
-    writer.append_s64(i);
+    writer.write_s64(i);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_integer(i, src);
 #endif
@@ -6602,72 +6263,6 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
 namespace simdjson {
 namespace fallback {
 
-/* begin file src/generic/stage2/logger.h */
-// This is for an internal-only stage 2 specific logger.
-// Set LOG_ENABLED = true to log what stage 2 is doing!
-namespace logger {
-  static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
-
-  static constexpr const bool LOG_ENABLED = false;
-  static constexpr const int LOG_EVENT_LEN = 30;
-  static constexpr const int LOG_BUFFER_LEN = 20;
-  static constexpr const int LOG_DETAIL_LEN = 50;
-  static constexpr const int LOG_INDEX_LEN = 10;
-
-  static int log_depth; // Not threadsafe. Log only.
-
-  // Helper to turn unprintable or newline characters into spaces
-  static really_inline char printable_char(char c) {
-    if (c >= 0x20) {
-      return c;
-    } else {
-      return ' ';
-    }
-  }
-
-  // Print the header and set up log_start
-  static really_inline void log_start() {
-    if (LOG_ENABLED) {
-      log_depth = 0;
-      printf("\n");
-      printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index");
-      printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES);
-    }
-  }
-
-  static really_inline void log_string(const char *message) {
-    if (LOG_ENABLED) {
-      printf("%s\n", message);
-    }
-  }
-
-  // Logs a single line of 
-  template<typename S>
-  static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
-    if (LOG_ENABLED) {
-      printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
-      {
-        // Print the next N characters in the buffer.
-        printf("| ");
-        // Otherwise, print the characters starting from the buffer position.
-        // Print spaces for unprintable or newline characters.
-        for (int i=0;i<LOG_BUFFER_LEN;i++) {
-          printf("%c", printable_char(structurals.current()[i]));
-        }
-        printf(" ");
-      }
-      printf("|    %c ", printable_char(structurals.current_char()));
-      printf("|    %c ", printable_char(structurals.peek_next_char()));
-      printf("| %5u ", structurals.parser.structural_indexes[*(structurals.current_structural+1)]);
-      printf("| %5u ", structurals.next_tape_index());
-      printf("| %-*s ", LOG_DETAIL_LEN, detail);
-      printf("| %*u ", LOG_INDEX_LEN, *structurals.current_structural);
-      printf("|\n");
-    }
-  }
-} // namespace logger
-
-/* end file src/generic/stage2/logger.h */
 /* begin file src/generic/stage2/atomparsing.h */
 namespace stage2 {
 namespace atomparsing {
@@ -6726,34 +6321,26 @@ namespace stage2 {
 
 class structural_iterator {
 public:
-  const uint8_t* const buf;
-  uint32_t *current_structural;
-  dom_parser_implementation &parser;
-
-  // Start a structural 
-  really_inline structural_iterator(dom_parser_implementation &_parser, size_t start_structural_index)
-    : buf{_parser.buf},
-      current_structural{&_parser.structural_indexes[start_structural_index]},
-      parser{_parser} {
-  }
-  // Get the buffer position of the current structural character
-  really_inline const uint8_t* current() {
-    return &buf[*current_structural];
+  really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index)
+    : buf{_buf},
+     len{_len},
+     structural_indexes{_structural_indexes},
+     next_structural{next_structural_index}
+    {}
+  really_inline char advance_char() {
+    idx = structural_indexes[next_structural];
+    next_structural++;
+    c = *current();
+    return c;
   }
-  // Get the current structural character
   really_inline char current_char() {
-    return buf[*current_structural];
-  }
-  // Get the next structural character without advancing
-  really_inline char peek_next_char() {
-    return buf[*(current_structural+1)];
+    return c;
   }
-  really_inline char advance_char() {
-    current_structural++;
-    return buf[*current_structural];
+  really_inline const uint8_t* current() {
+    return &buf[idx];
   }
   really_inline size_t remaining_len() {
-    return parser.len - *current_structural;
+    return len - idx;
   }
   template<typename F>
   really_inline bool with_space_terminated_copy(const F& f) {
@@ -6770,25 +6357,32 @@ class structural_iterator {
     * practice unless you are in the strange scenario where you have many JSON
     * documents made of single atoms.
     */
-    char *copy = static_cast<char *>(malloc(parser.len + SIMDJSON_PADDING));
+    char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
     if (copy == nullptr) {
       return true;
     }
-    memcpy(copy, buf, parser.len);
-    memset(copy + parser.len, ' ', SIMDJSON_PADDING);
-    bool result = f(reinterpret_cast<const uint8_t*>(copy), *current_structural);
+    memcpy(copy, buf, len);
+    memset(copy + len, ' ', SIMDJSON_PADDING);
+    bool result = f(reinterpret_cast<const uint8_t*>(copy), idx);
     free(copy);
     return result;
   }
   really_inline bool past_end(uint32_t n_structural_indexes) {
-    return current_structural >= &parser.structural_indexes[n_structural_indexes];
+    return next_structural+1 > n_structural_indexes;
   }
   really_inline bool at_end(uint32_t n_structural_indexes) {
-    return current_structural == &parser.structural_indexes[n_structural_indexes];
+    return next_structural+1 == n_structural_indexes;
   }
-  really_inline bool at_beginning() {
-    return current_structural == parser.structural_indexes.get();
+  really_inline size_t next_structural_index() {
+    return next_structural;
   }
+
+  const uint8_t* const buf;
+  const size_t len;
+  const uint32_t* const structural_indexes;
+  size_t next_structural; // next structural index
+  size_t idx{0}; // location of the structural character in the input (buf)
+  uint8_t c{0};  // used to track the (structural) character we are looking at
 };
 
 } // namespace stage2
@@ -6800,105 +6394,8 @@ class structural_iterator {
 // "simdjson/stage2.h" (this simplifies amalgation)
 
 namespace stage2 {
-namespace { // Make everything here private
-
-/* begin file src/generic/stage2/tape_writer.h */
-struct tape_writer {
-  /** The next place to write to tape */
-  uint64_t *next_tape_loc;
-  
-  /** Write a signed 64-bit value to tape. */
-  really_inline void append_s64(int64_t value) noexcept;
-
-  /** Write an unsigned 64-bit value to tape. */
-  really_inline void append_u64(uint64_t value) noexcept;
-
-  /** Write a double value to tape. */
-  really_inline void append_double(double value) noexcept;
-
-  /**
-   * Append a tape entry (an 8-bit type,and 56 bits worth of value).
-   */
-  really_inline void append(uint64_t val, internal::tape_type t) noexcept;
-
-  /**
-   * Skip the current tape entry without writing.
-   *
-   * Used to skip the start of the container, since we'll come back later to fill it in when the
-   * container ends.
-   */
-  really_inline void skip() noexcept;
-
-  /**
-   * Skip the number of tape entries necessary to write a large u64 or i64.
-   */
-  really_inline void skip_large_integer() noexcept;
-
-  /**
-   * Skip the number of tape entries necessary to write a double.
-   */
-  really_inline void skip_double() noexcept;
-
-  /**
-   * Write a value to a known location on tape.
-   *
-   * Used to go back and write out the start of a container after the container ends.
-   */
-  really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
-
-private:
-  /**
-   * Append both the tape entry, and a supplementary value following it. Used for types that need
-   * all 64 bits, such as double and uint64_t.
-   */
-  template<typename T>
-  really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
-}; // struct number_writer
-
-really_inline void tape_writer::append_s64(int64_t value) noexcept {
-  append2(0, value, internal::tape_type::INT64);
-}
-
-really_inline void tape_writer::append_u64(uint64_t value) noexcept {
-  append(0, internal::tape_type::UINT64);
-  *next_tape_loc = value;
-  next_tape_loc++;
-}
-
-/** Write a double value to tape. */
-really_inline void tape_writer::append_double(double value) noexcept {
-  append2(0, value, internal::tape_type::DOUBLE);
-}
-
-really_inline void tape_writer::skip() noexcept {
-  next_tape_loc++;
-}
-
-really_inline void tape_writer::skip_large_integer() noexcept {
-  next_tape_loc += 2;
-}
-
-really_inline void tape_writer::skip_double() noexcept {
-  next_tape_loc += 2;
-}
-
-really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
-  *next_tape_loc = val | ((uint64_t(char(t))) << 56);
-  next_tape_loc++;
-}
-
-template<typename T>
-really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
-  append(val, t);
-  static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
-  memcpy(next_tape_loc, &val2, sizeof(val2));
-  next_tape_loc++;
-}
 
-really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
-  tape_loc = val | ((uint64_t(char(t))) << 56);
-}
-/* end file src/generic/stage2/tape_writer.h */
+using internal::ret_address;
 
 #ifdef SIMDJSON_USE_COMPUTED_GOTO
 #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue }
@@ -6929,88 +6426,102 @@ really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal
 #endif // SIMDJSON_USE_COMPUTED_GOTO
 
 struct unified_machine_addresses {
-  ret_address_t array_begin;
-  ret_address_t array_continue;
-  ret_address_t error;
-  ret_address_t finish;
-  ret_address_t object_begin;
-  ret_address_t object_continue;
+  ret_address array_begin;
+  ret_address array_continue;
+  ret_address error;
+  ret_address finish;
+  ret_address object_begin;
+  ret_address object_continue;
 };
 
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } }
 
-struct structural_parser : structural_iterator {
-  /** Lets you append to the tape */
-  tape_writer tape;
+struct number_writer { // writes parsed numbers to doc_parser.doc.tape, advancing doc_parser.current_loc
+  parser &doc_parser; // every write below goes through this parser's doc.tape and current_loc
+  
+  really_inline void write_s64(int64_t value) noexcept { // two entries: an INT64 tag entry, then the raw bytes of value
+    write_tape(0, internal::tape_type::INT64);
+    std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value));
+    ++doc_parser.current_loc;
+  }
+  really_inline void write_u64(uint64_t value) noexcept { // two entries: a UINT64 tag entry, then value itself
+    write_tape(0, internal::tape_type::UINT64);
+    doc_parser.doc.tape[doc_parser.current_loc++] = value;
+  }
+  really_inline void write_double(double value) noexcept { // two entries: a DOUBLE tag entry, then the raw bytes of value
+    write_tape(0, internal::tape_type::DOUBLE);
+    static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size");
+    memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double));
+    // the bytes are copied with memcpy instead of being read through a cast pointer, i.e. *((uint64_t *)&d)
+  }
+  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept { // one entry: type tag shifted into the top 8 bits, OR-ed with val
+    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
+  }
+}; // struct number_writer
+
+struct structural_parser {
+  structural_iterator structurals;
+  parser &doc_parser;
   /** Next write location in the string buf for stage 2 parsing */
-  uint8_t *current_string_buf_loc;
-  /** Current depth (nested objects and arrays) */
-  uint32_t depth{0};
-
-  // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations
-  really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index)
-    : structural_iterator(_parser, start_structural_index),
-      tape{parser.doc->tape.get()},
-      current_string_buf_loc{parser.doc->string_buf.get()} {
-  }
-
-  WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) {
-    parser.containing_scope[depth].tape_index = next_tape_index();
-    parser.containing_scope[depth].count = 0;
-    tape.skip(); // We don't actually *write* the start element until the end.
-    parser.ret_address[depth] = continue_state;
+  uint8_t *current_string_buf_loc{};
+  uint32_t depth;
+
+  really_inline structural_parser( // builds the structural iterator over buf and binds the parser; depth starts at 0
+    const uint8_t *buf, // input bytes, handed to the structural iterator
+    size_t len, // length of the input, handed to the structural iterator
+    parser &_doc_parser, // kept by reference; also supplies structural_indexes for the iterator
+    uint32_t next_structural = 0 // index of the first structural the iterator will read
+  ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {}
+
+  WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) {
+    doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc;
+    doc_parser.containing_scope[depth].count = 0;
+    write_tape(0, type); // if the document is correct, this gets rewritten later
+    doc_parser.ret_address[depth] = continue_state;
     depth++;
-    bool exceeded_max_depth = depth >= parser.max_depth();
-    if (exceeded_max_depth) { log_error("Exceeded max depth!"); }
-    return exceeded_max_depth;
+    return depth >= doc_parser.max_depth();
   }
 
-  WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) {
-    log_start_value("document");
-    return start_scope(continue_state);
+  WARN_UNUSED really_inline bool start_document(ret_address continue_state) {
+    return start_scope(internal::tape_type::ROOT, continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) {
-    log_start_value("object");
-    return start_scope(continue_state);
+  WARN_UNUSED really_inline bool start_object(ret_address continue_state) {
+    return start_scope(internal::tape_type::START_OBJECT, continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) {
-    log_start_value("array");
-    return start_scope(continue_state);
+  WARN_UNUSED really_inline bool start_array(ret_address continue_state) {
+    return start_scope(internal::tape_type::START_ARRAY, continue_state);
   }
 
   // this function is responsible for annotating the start of the scope
-  really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept {
+  really_inline void end_scope(internal::tape_type type) noexcept {
     depth--;
-    // write our doc->tape location to the header scope
+    // write our doc.tape location to the header scope
     // The root scope gets written *at* the previous location.
-    tape.append(parser.containing_scope[depth].tape_index, end);
+    write_tape(doc_parser.containing_scope[depth].tape_index, type);
     // count can overflow if it exceeds 24 bits... so we saturate
     // the convention being that a cnt of 0xffffff or more is undetermined in value (>=  0xffffff).
-    const uint32_t start_tape_index = parser.containing_scope[depth].tape_index;
-    const uint32_t count = parser.containing_scope[depth].count;
+    const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index;
+    const uint32_t count = doc_parser.containing_scope[depth].count;
     const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
-    // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index]
-    tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start);
-  }
-
-  really_inline uint32_t next_tape_index() {
-    return uint32_t(tape.next_tape_loc - parser.doc->tape.get());
+    // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index]
+    doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32);
   }
 
   really_inline void end_object() {
-    log_end_value("object");
-    end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
+    end_scope(internal::tape_type::END_OBJECT);
   }
   really_inline void end_array() {
-    log_end_value("array");
-    end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
+    end_scope(internal::tape_type::END_ARRAY);
   }
   really_inline void end_document() {
-    log_end_value("document");
-    end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT);
+    end_scope(internal::tape_type::ROOT);
+  }
+
+  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
+    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
   }
 
   // increment_count increments the count of keys in an object or values in an array.
@@ -7018,16 +6529,17 @@ struct structural_parser : structural_iterator {
   // must be increment in the preceding depth (depth-1) where the array or
   // the object resides.
   really_inline void increment_count() {
-    parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
+    doc_parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
   }
 
   really_inline uint8_t *on_start_string() noexcept {
-    // we advance the point, accounting for the fact that we have a NULL termination
-    tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING);
+    /* we advance the point, accounting for the fact that we have a NULL
+      * termination         */
+    write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING);
     return current_string_buf_loc + sizeof(uint32_t);
   }
 
-  really_inline void on_end_string(uint8_t *dst) noexcept {
+  really_inline bool on_end_string(uint8_t *dst) noexcept {
     uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
     // TODO check for overflow in case someone has a crazy string (>=4GB?)
     // But only add the overflow check when the document itself exceeds 4GB
@@ -7037,49 +6549,73 @@ struct structural_parser : structural_iterator {
     // be NULL terminated? It comes at a small cost
     *dst = 0;
     current_string_buf_loc = dst + 1;
+    return true;
   }
 
-  WARN_UNUSED really_inline bool parse_string(bool key = false) {
-    log_value(key ? "key" : "string");
+  WARN_UNUSED really_inline bool parse_string() {
     uint8_t *dst = on_start_string();
-    dst = stringparsing::parse_string(current(), dst);
+    dst = stringparsing::parse_string(structurals.current(), dst);
     if (dst == nullptr) {
-      log_error("Invalid escape in string");
       return true;
     }
-    on_end_string(dst);
-    return false;
+    return !on_end_string(dst);
   }
 
   WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) {
-    log_value("number");
-    bool succeeded = numberparsing::parse_number(src, found_minus, tape);
-    if (!succeeded) { log_error("Invalid number"); }
-    return !succeeded;
+    number_writer writer{doc_parser};
+    return !numberparsing::parse_number(src, found_minus, writer);
   }
   WARN_UNUSED really_inline bool parse_number(bool found_minus) {
-    return parse_number(current(), found_minus);
+    return parse_number(structurals.current(), found_minus);
+  }
+
+  WARN_UNUSED really_inline bool parse_atom() { // returns true on FAILURE; false after one tape entry has been written
+    switch (structurals.current_char()) { // dispatch on the first character of the atom
+      case 't':
+        if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; }
+        write_tape(0, internal::tape_type::TRUE_VALUE);
+        break;
+      case 'f':
+        if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; }
+        write_tape(0, internal::tape_type::FALSE_VALUE);
+        break;
+      case 'n':
+        if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; }
+        write_tape(0, internal::tape_type::NULL_VALUE);
+        break;
+      default:
+        return true; // current character does not start true, false or null
+    }
+    return false; // success: the atom was validated and written
+  }
+
+  WARN_UNUSED really_inline bool parse_single_atom() { // like parse_atom(), but the validators also get the bytes left in the input; true on FAILURE
+    switch (structurals.current_char()) { // dispatch on the first character of the atom
+      case 't':
+        if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; }
+        write_tape(0, internal::tape_type::TRUE_VALUE);
+        break;
+      case 'f':
+        if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; }
+        write_tape(0, internal::tape_type::FALSE_VALUE);
+        break;
+      case 'n':
+        if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; }
+        write_tape(0, internal::tape_type::NULL_VALUE);
+        break;
+      default:
+        return true; // current character does not start true, false or null
+    }
+    return false; // success: the atom was validated and written
+  }
 
-  WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) {
-    switch (advance_char()) {
+  WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) {
+    switch (structurals.current_char()) {
     case '"':
       FAIL_IF( parse_string() );
       return continue_state;
-    case 't':
-      log_value("true");
-      FAIL_IF( !atomparsing::is_valid_true_atom(current()) );
-      tape.append(0, internal::tape_type::TRUE_VALUE);
-      return continue_state;
-    case 'f':
-      log_value("false");
-      FAIL_IF( !atomparsing::is_valid_false_atom(current()) );
-      tape.append(0, internal::tape_type::FALSE_VALUE);
-      return continue_state;
-    case 'n':
-      log_value("null");
-      FAIL_IF( !atomparsing::is_valid_null_atom(current()) );
-      tape.append(0, internal::tape_type::NULL_VALUE);
+    case 't': case 'f': case 'n':
+      FAIL_IF( parse_atom() );
       return continue_state;
     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
@@ -7095,27 +6631,40 @@ struct structural_parser : structural_iterator {
       FAIL_IF( start_array(continue_state) );
       return addresses.array_begin;
     default:
-      log_error("Non-value found when value was expected!");
       return addresses.error;
     }
   }
 
   WARN_UNUSED really_inline error_code finish() {
+    // the string might not be NULL terminated.
+    if ( !structurals.at_end(doc_parser.n_structural_indexes) ) {
+      return on_error(TAPE_ERROR);
+    }
     end_document();
-    parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]);
-
     if (depth != 0) {
-      log_error("Unclosed objects or arrays!");
-      return parser.error = TAPE_ERROR;
+      return on_error(TAPE_ERROR);
+    }
+    if (doc_parser.containing_scope[depth].tape_index != 0) {
+      return on_error(TAPE_ERROR);
     }
 
-    return SUCCESS;
+    return on_success(SUCCESS);
+  }
+
+  really_inline error_code on_error(error_code new_error_code) noexcept { // stores the code on doc_parser and returns it unchanged
+    doc_parser.error = new_error_code;
+    return new_error_code;
+  }
+  really_inline error_code on_success(error_code success_code) noexcept { // stores the code, marks doc_parser valid, returns the code
+    doc_parser.error = success_code;
+    doc_parser.valid = true; // init() set this flag to false before parsing began
+    return success_code;
+  }
 
   WARN_UNUSED really_inline error_code error() {
-    /* We do not need the next line because this is done by parser.init_stage2(),
+    /* We do not need the next line because this is done by doc_parser.init_stage2(),
     * pessimistically.
-    * parser.is_valid  = false;
+    * doc_parser.is_valid  = false;
     * At this point in the code, we have all the time in the world.
     * Note that we know exactly where we are in the document so we could,
     * without any overhead on the processing code, report a specific
@@ -7123,12 +6672,12 @@ struct structural_parser : structural_iterator {
     * We could even trigger special code paths to assess what happened
     * carefully,
     * all without any added cost. */
-    if (depth >= parser.max_depth()) {
-      return parser.error = DEPTH_ERROR;
+    if (depth >= doc_parser.max_depth()) {
+      return on_error(DEPTH_ERROR);
     }
-    switch (current_char()) {
+    switch (structurals.current_char()) {
     case '"':
-      return parser.error = STRING_ERROR;
+      return on_error(STRING_ERROR);
     case '0':
     case '1':
     case '2':
@@ -7140,124 +6689,92 @@ struct structural_parser : structural_iterator {
     case '8':
     case '9':
     case '-':
-      return parser.error = NUMBER_ERROR;
+      return on_error(NUMBER_ERROR);
     case 't':
-      return parser.error = T_ATOM_ERROR;
+      return on_error(T_ATOM_ERROR);
     case 'n':
-      return parser.error = N_ATOM_ERROR;
+      return on_error(N_ATOM_ERROR);
     case 'f':
-      return parser.error = F_ATOM_ERROR;
+      return on_error(F_ATOM_ERROR);
     default:
-      return parser.error = TAPE_ERROR;
+      return on_error(TAPE_ERROR);
     }
   }
 
   really_inline void init() {
-    log_start();
-    parser.error = UNINITIALIZED;
+    current_string_buf_loc = doc_parser.doc.string_buf.get();
+    doc_parser.current_loc = 0;
+    doc_parser.valid = false;
+    doc_parser.error = UNINITIALIZED;
   }
 
-  WARN_UNUSED really_inline error_code start(ret_address_t finish_state) {
-    // If there are no structurals left, return EMPTY
-    if (at_end(parser.n_structural_indexes)) {
-      return parser.error = EMPTY;
+  WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) {
+    init(); // sets is_valid to false
+    if (len > doc_parser.capacity()) {
+      return CAPACITY;
     }
-
-    init();
+    // Advance to the first character as soon as possible
+    structurals.advance_char();
     // Push the root scope (there is always at least one scope)
     if (start_document(finish_state)) {
-      return parser.error = DEPTH_ERROR;
+      return on_error(DEPTH_ERROR);
     }
     return SUCCESS;
   }
 
-  really_inline void log_value(const char *type) {
-    logger::log_line(*this, "", type, "");
-  }
-
-  static really_inline void log_start() {
-    logger::log_start();
-  }
-
-  really_inline void log_start_value(const char *type) {
-    logger::log_line(*this, "+", type, "");
-    if (logger::LOG_ENABLED) { logger::log_depth++; }
-  }
-
-  really_inline void log_end_value(const char *type) {
-    if (logger::LOG_ENABLED) { logger::log_depth--; }
-    logger::log_line(*this, "-", type, "");
-  }
-
-  really_inline void log_error(const char *error) {
-    logger::log_line(*this, "", "ERROR", error);
+  really_inline char advance_char() {
+    return structurals.advance_char();
   }
-}; // struct structural_parser
+};
 
 // Redefine FAIL_IF to use goto since it'll be used inside the function now
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { goto error; } }
 
-template<bool STREAMING>
-WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
-  dom_parser.doc = &doc;
+} // namespace stage2
+
+/************
+ * The JSON is parsed to a tape, see the accompanying tape.md file
+ * for documentation.
+ ***********/
+WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
   static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
-  stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
-  error_code result = parser.start(addresses.finish);
+  stage2::structural_parser parser(buf, len, doc_parser);
+  error_code result = parser.start(len, addresses.finish);
   if (result) { return result; }
 
   //
   // Read first value
   //
-  switch (parser.current_char()) {
+  switch (parser.structurals.current_char()) {
   case '{':
     FAIL_IF( parser.start_object(addresses.finish) );
     goto object_begin;
   case '[':
     FAIL_IF( parser.start_array(addresses.finish) );
-    // Make sure the outer array is closed before continuing; otherwise, there are ways we could get
-    // into memory corruption. See https://github.com/simdjson/simdjson/issues/906
-    if (!STREAMING) {
-      if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') {
-        goto error;
-      }
-    }
     goto array_begin;
   case '"':
     FAIL_IF( parser.parse_string() );
     goto finish;
-  case 't':
-    parser.log_value("true");
-    FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) );
-    parser.tape.append(0, internal::tape_type::TRUE_VALUE);
-    goto finish;
-  case 'f':
-    parser.log_value("false");
-    FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) );
-    parser.tape.append(0, internal::tape_type::FALSE_VALUE);
-    goto finish;
-  case 'n':
-    parser.log_value("null");
-    FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) );
-    parser.tape.append(0, internal::tape_type::NULL_VALUE);
+  case 't': case 'f': case 'n':
+    FAIL_IF( parser.parse_single_atom() );
     goto finish;
   case '0': case '1': case '2': case '3': case '4':
   case '5': case '6': case '7': case '8': case '9':
     FAIL_IF(
-      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], false);
       })
     );
     goto finish;
   case '-':
     FAIL_IF(
-      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], true);
       })
     );
     goto finish;
   default:
-    parser.log_error("Document starts with a non-value character");
     goto error;
   }
 
@@ -7268,45 +6785,43 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
   switch (parser.advance_char()) {
   case '"': {
     parser.increment_count();
-    FAIL_IF( parser.parse_string(true) );
+    FAIL_IF( parser.parse_string() );
     goto object_key_state;
   }
   case '}':
     parser.end_object();
     goto scope_end;
   default:
-    parser.log_error("Object does not start with a key");
     goto error;
   }
 
 object_key_state:
-  if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; }
+  FAIL_IF( parser.advance_char() != ':' );
+  parser.advance_char();
   GOTO( parser.parse_value(addresses, addresses.object_continue) );
 
 object_continue:
   switch (parser.advance_char()) {
   case ',':
     parser.increment_count();
-    if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; }
-    FAIL_IF( parser.parse_string(true) );
+    FAIL_IF( parser.advance_char() != '"' );
+    FAIL_IF( parser.parse_string() );
     goto object_key_state;
   case '}':
     parser.end_object();
     goto scope_end;
   default:
-    parser.log_error("No comma between object fields");
     goto error;
   }
 
 scope_end:
-  CONTINUE( parser.parser.ret_address[parser.depth] );
+  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
 
 //
 // Array parser states
 //
 array_begin:
-  if (parser.peek_next_char() == ']') {
-    parser.advance_char();
+  if (parser.advance_char() == ']') {
     parser.end_array();
     goto scope_end;
   }
@@ -7321,12 +6836,12 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
   switch (parser.advance_char()) {
   case ',':
     parser.increment_count();
+    parser.advance_char();
     goto main_array_switch;
   case ']':
     parser.end_array();
     goto scope_end;
   default:
-    parser.log_error("Missing comma between array values");
     goto error;
   }
 
@@ -7337,191 +6852,178 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
   return parser.error();
 }
 
-} // namespace {}
-} // namespace stage2
+WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
+  error_code code = stage1(buf, len, doc_parser, false);
+  if (!code) {
+    code = stage2(buf, len, doc_parser);
+  }
+  return code;
+}
+/* end file src/generic/stage2/structural_parser.h */
+/* begin file src/generic/stage2/streaming_structural_parser.h */
+namespace stage2 {
 
-/************
- * The JSON is parsed to a tape, see the accompanying tape.md file
- * for documentation.
- ***********/
-WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
-  error_code result = stage2::parse_structurals<false>(*this, _doc);
-  if (result) { return result; }
+struct streaming_structural_parser: structural_parser {
+  really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {}
 
-  // If we didn't make it to the end, it's an error
-  if ( next_structural_index != n_structural_indexes ) {
-    logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
-    return error = TAPE_ERROR;
+  // override to add streaming
+  WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) {
+    init(); // sets is_valid to false
+    // Capacity ain't no thang for streaming, so we don't check it.
+    // Advance to the first character as soon as possible
+    advance_char();
+    // Push the root scope (there is always at least one scope)
+    if (start_document(finish_parser)) {
+      return on_error(DEPTH_ERROR);
+    }
+    return SUCCESS;
   }
 
-  return SUCCESS;
-}
+  // override to add streaming
+  WARN_UNUSED really_inline error_code finish() {
+    if ( structurals.past_end(doc_parser.n_structural_indexes) ) {
+      return on_error(TAPE_ERROR);
+    }
+    end_document();
+    if (depth != 0) {
+      return on_error(TAPE_ERROR);
+    }
+    if (doc_parser.containing_scope[depth].tape_index != 0) {
+      return on_error(TAPE_ERROR);
+    }
+    bool finished = structurals.at_end(doc_parser.n_structural_indexes);
+    return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
+  }
+};
+
+} // namespace stage2
 
 /************
  * The JSON is parsed to a tape, see the accompanying tape.md file
  * for documentation.
  ***********/
-WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
-  return stage2::parse_structurals<true>(*this, _doc);
-}
-/* end file src/generic/stage2/tape_writer.h */
-
-WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
-  error_code err = stage1(_buf, _len, false);
-  if (err) { return err; }
-  return stage2(_doc);
-}
-
-} // namespace fallback
-} // namespace simdjson
-/* end file src/generic/stage2/tape_writer.h */
-#endif
-#if SIMDJSON_IMPLEMENTATION_HASWELL
-/* begin file src/haswell/implementation.cpp */
-/* haswell/implementation.h already included: #include "haswell/implementation.h" */
-/* begin file src/haswell/dom_parser_implementation.h */
-#ifndef SIMDJSON_HASWELL_DOM_PARSER_IMPLEMENTATION_H
-#define SIMDJSON_HASWELL_DOM_PARSER_IMPLEMENTATION_H
-
-/* isadetection.h already included: #include "isadetection.h" */
-
-namespace simdjson {
-namespace haswell {
-
-/* begin file src/generic/dom_parser_implementation.h */
-// expectation: sizeof(scope_descriptor) = 64/8.
-struct scope_descriptor {
-  uint32_t tape_index; // where, on the tape, does the scope ([,{) begins
-  uint32_t count; // how many elements in the scope
-}; // struct scope_descriptor
-
-#ifdef SIMDJSON_USE_COMPUTED_GOTO
-typedef void* ret_address_t;
-#else
-typedef char ret_address_t;
-#endif
-
-class dom_parser_implementation final : public internal::dom_parser_implementation {
-public:
-  /** Tape location of each open { or [ */
-  std::unique_ptr<scope_descriptor[]> containing_scope{};
-  /** Return address of each open { or [ */
-  std::unique_ptr<ret_address_t[]> ret_address{};
-  /** Buffer passed to stage 1 */
-  const uint8_t *buf{};
-  /** Length passed to stage 1 */
-  size_t len{0};
-  /** Document passed to stage 2 */
-  dom::document *doc{};
-  /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */
-  error_code error{UNINITIALIZED};
-
-  really_inline dom_parser_implementation();
-  dom_parser_implementation(const dom_parser_implementation &) = delete;
-  dom_parser_implementation & operator=(const dom_parser_implementation &) = delete;
-
-  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
-  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final;
-  WARN_UNUSED error_code check_for_unclosed_array() noexcept;
-  WARN_UNUSED error_code stage2(dom::document &doc) noexcept final;
-  WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final;
-  WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final;
-  WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final;
-};
-
-/* begin file src/generic/stage1/allocate.h */
-namespace stage1 {
-namespace allocate {
-
-//
-// Allocates stage 1 internal state and outputs in the parser
-//
-really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) {
-  size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7;
-  parser.structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] );
-  if (!parser.structural_indexes) { return MEMALLOC; }
-  parser.structural_indexes[0] = 0;
-  parser.n_structural_indexes = 0;
-  return SUCCESS;
-}
-
-} // namespace allocate
-} // namespace stage1
-/* end file src/generic/stage1/allocate.h */
-/* begin file src/generic/stage2/allocate.h */
-namespace stage2 {
-namespace allocate {
+WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept {
+  static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
+  stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json));
+  error_code result = parser.start(len, addresses.finish);
+  if (result) { return result; }
+  //
+  // Read first value
+  //
+  switch (parser.structurals.current_char()) {
+  case '{':
+    FAIL_IF( parser.start_object(addresses.finish) );
+    goto object_begin;
+  case '[':
+    FAIL_IF( parser.start_array(addresses.finish) );
+    goto array_begin;
+  case '"':
+    FAIL_IF( parser.parse_string() );
+    goto finish;
+  case 't': case 'f': case 'n':
+    FAIL_IF( parser.parse_single_atom() );
+    goto finish;
+  case '0': case '1': case '2': case '3': case '4':
+  case '5': case '6': case '7': case '8': case '9':
+    FAIL_IF(
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+        return parser.parse_number(&copy[idx], false);
+      })
+    );
+    goto finish;
+  case '-':
+    FAIL_IF(
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+        return parser.parse_number(&copy[idx], true);
+      })
+    );
+    goto finish;
+  default:
+    goto error;
+  }
 
 //
-// Allocates stage 2 internal state and outputs in the parser
+// Object parser states
 //
-really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) {
-  parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]);
-  parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]);
-
-  if (!parser.ret_address || !parser.containing_scope) {
-    return MEMALLOC;
+object_begin:
+  switch (parser.advance_char()) {
+  case '"': {
+    FAIL_IF( parser.parse_string() );
+    goto object_key_parser;
+  }
+  case '}':
+    parser.end_object();
+    goto scope_end;
+  default:
+    goto error;
   }
-  return SUCCESS;
-}
-
-} // namespace allocate
-} // namespace stage2
-/* end file src/generic/stage2/allocate.h */
 
-really_inline dom_parser_implementation::dom_parser_implementation() {}
+object_key_parser:
+  FAIL_IF( parser.advance_char() != ':' );
+  parser.increment_count();
+  parser.advance_char();
+  GOTO( parser.parse_value(addresses, addresses.object_continue) );
 
-// Leaving these here so they can be inlined if so desired
-WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
-  error_code err = stage1::allocate::set_capacity(*this, capacity);
-  if (err) { _capacity = 0; return err; }
-  _capacity = capacity;
-  return SUCCESS;
-}
+object_continue:
+  switch (parser.advance_char()) {
+  case ',':
+    FAIL_IF( parser.advance_char() != '"' );
+    FAIL_IF( parser.parse_string() );
+    goto object_key_parser;
+  case '}':
+    parser.end_object();
+    goto scope_end;
+  default:
+    goto error;
+  }
 
-WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept {
-  error_code err = stage2::allocate::set_max_depth(*this, max_depth);
-  if (err) { _max_depth = 0; return err; }
-  _max_depth = max_depth;
-  return SUCCESS;
-}
-/* end file src/generic/stage2/allocate.h */
+scope_end:
+  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
 
-} // namespace haswell
-} // namespace simdjson
+//
+// Array parser states
+//
+array_begin:
+  if (parser.advance_char() == ']') {
+    parser.end_array();
+    goto scope_end;
+  }
+  parser.increment_count();
 
-#endif // SIMDJSON_HASWELL_DOM_PARSER_IMPLEMENTATION_H
-/* end file src/generic/stage2/allocate.h */
+main_array_switch:
+  /* we call update char on all paths in, so we can peek at parser.c on the
+   * paths that can accept a close square brace (post-, and at start) */
+  GOTO( parser.parse_value(addresses, addresses.array_continue) );
 
-TARGET_HASWELL
+array_continue:
+  switch (parser.advance_char()) {
+  case ',':
+    parser.increment_count();
+    parser.advance_char();
+    goto main_array_switch;
+  case ']':
+    parser.end_array();
+    goto scope_end;
+  default:
+    goto error;
+  }
 
-namespace simdjson {
-namespace haswell {
+finish:
+  next_json = parser.structurals.next_structural_index();
+  return parser.finish();
 
-WARN_UNUSED error_code implementation::create_dom_parser_implementation(
-  size_t capacity,
-  size_t max_depth,
-  std::unique_ptr<internal::dom_parser_implementation>& dst
-) const noexcept {
-  dst.reset( new (std::nothrow) dom_parser_implementation() );
-  if (!dst) { return MEMALLOC; }
-  dst->set_capacity(capacity);
-  dst->set_max_depth(max_depth);
-  return SUCCESS;
+error:
+  return parser.error();
 }
+/* end file src/generic/stage2/streaming_structural_parser.h */
 
-} // namespace haswell
+} // namespace fallback
 } // namespace simdjson
+/* end file src/generic/stage2/streaming_structural_parser.h */
+#endif
+#if SIMDJSON_IMPLEMENTATION_HASWELL
+/* begin file src/haswell/stage1.cpp */
 
-UNTARGET_REGION
-/* end file src/generic/stage2/allocate.h */
-/* begin file src/haswell/dom_parser_implementation.cpp */
-/* haswell/implementation.h already included: #include "haswell/implementation.h" */
-/* haswell/dom_parser_implementation.h already included: #include "haswell/dom_parser_implementation.h" */
-
-//
-// Stage 1
-//
 /* begin file src/haswell/bitmask.h */
 #ifndef SIMDJSON_HASWELL_BITMASK_H
 #define SIMDJSON_HASWELL_BITMASK_H
@@ -8066,6 +7568,7 @@ UNTARGET_REGION
 #endif // SIMDJSON_HASWELL_SIMD_H
 /* end file src/haswell/bitmanipulation.h */
 /* haswell/bitmanipulation.h already included: #include "haswell/bitmanipulation.h" */
+/* haswell/implementation.h already included: #include "haswell/implementation.h" */
 
 TARGET_HASWELL
 namespace simdjson {
@@ -8124,21 +7627,24 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-  really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
-  really_inline size_t block_index();
-  really_inline bool has_full_block() const;
-  really_inline const uint8_t *full_block() const;
-  /**
-   * Get the last block, padded with spaces.
-   *
-   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
-   * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
-   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
-   *
-   * @return the number of effective characters in the last block.
-   */
-  really_inline size_t get_remainder(uint8_t *dst) const;
-  really_inline void advance();
+  really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
+  really_inline size_t block_index() { return idx; }
+  really_inline bool has_full_block() const {
+    return idx < lenminusstep;
+  }
+  really_inline const uint8_t *full_block() const {
+    return &buf[idx];
+  }
+  really_inline bool has_remainder() const {
+    return idx < len;
+  }
+  really_inline void get_remainder(uint8_t *tmp_buf) const {
+    memset(tmp_buf, 0x20, STEP_SIZE);
+    memcpy(tmp_buf, buf + idx, len - idx);
+  }
+  really_inline void advance() {
+    idx += STEP_SIZE;
+  }
 private:
   const uint8_t *buf;
   const size_t len;
@@ -8146,18 +7652,6 @@ struct buf_block_reader {
   size_t idx;
 };
 
-constexpr const int TITLE_SIZE = 12;
-
-// Routines to print masks and text for debugging bitmask operations
-UNUSED static char * format_input_text_64(const uint8_t *text) {
-  static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
-}
-
 // Routines to print masks and text for debugging bitmask operations
 UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
   static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
@@ -8177,34 +7671,6 @@ UNUSED static char * format_mask(uint64_t mask) {
   buf[64] = '\0';
   return buf;
 }
-
-template<size_t STEP_SIZE>
-really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
-
-template<size_t STEP_SIZE>
-really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
-
-template<size_t STEP_SIZE>
-really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
-  return idx < lenminusstep;
-}
-
-template<size_t STEP_SIZE>
-really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
-  return &buf[idx];
-}
-
-template<size_t STEP_SIZE>
-really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
-  memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
-  memcpy(dst, buf + idx, len - idx);
-  return len - idx;
-}
-
-template<size_t STEP_SIZE>
-really_inline void buf_block_reader<STEP_SIZE>::advance() {
-  idx += STEP_SIZE;
-}
 /* end file src/generic/stage1/buf_block_reader.h */
 /* begin file src/generic/stage1/json_string_scanner.h */
 namespace stage1 {
@@ -8504,15 +7970,13 @@ template<size_t STEP_SIZE>
 error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
   buf_block_reader<STEP_SIZE> reader(buf, len);
   json_minifier minifier(dst);
-
-  // Index the first n-1 blocks
   while (reader.has_full_block()) {
     minifier.step<STEP_SIZE>(reader.full_block(), reader);
   }
 
-  // Index the last (remainder) block, padded with spaces
-  uint8_t block[STEP_SIZE];
-  if (likely(reader.get_remainder(block)) > 0) {
+  if (likely(reader.has_remainder())) {
+    uint8_t block[STEP_SIZE];
+    reader.get_remainder(block);
     minifier.step<STEP_SIZE>(block, reader);
   }
 
@@ -8525,94 +7989,6 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
   return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
 }
 
-/* begin file src/generic/stage1/find_next_document_index.h */
-/**
-  * This algorithm is used to quickly identify the last structural position that
-  * makes up a complete document.
-  *
-  * It does this by going backwards and finding the last *document boundary* (a
-  * place where one value follows another without a comma between them). If the
-  * last document (the characters after the boundary) has an equal number of
-  * start and end brackets, it is considered complete.
-  *
-  * Simply put, we iterate over the structural characters, starting from
-  * the end. We consider that we found the end of a JSON document when the
-  * first element of the pair is NOT one of these characters: '{' '[' ';' ','
-  * and when the second element is NOT one of these characters: '}' '}' ';' ','.
-  *
-  * This simple comparison works most of the time, but it does not cover cases
-  * where the batch's structural indexes contain a perfect amount of documents.
-  * In such a case, we do not have access to the structural index which follows
-  * the last document, therefore, we do not have access to the second element in
-  * the pair, and that means we cannot identify the last document. To fix this
-  * issue, we keep a count of the open and closed curly/square braces we found
-  * while searching for the pair. When we find a pair AND the count of open and
-  * closed curly/square braces is the same, we know that we just passed a
-  * complete document, therefore the last json buffer location is the end of the
-  * batch.
-  */
-really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) {
-  // TODO don't count separately, just figure out depth
-  auto arr_cnt = 0;
-  auto obj_cnt = 0;
-  for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
-    auto idxb = parser.structural_indexes[i];
-    switch (parser.buf[idxb]) {
-    case ':':
-    case ',':
-      continue;
-    case '}':
-      obj_cnt--;
-      continue;
-    case ']':
-      arr_cnt--;
-      continue;
-    case '{':
-      obj_cnt++;
-      break;
-    case '[':
-      arr_cnt++;
-      break;
-    }
-    auto idxa = parser.structural_indexes[i - 1];
-    switch (parser.buf[idxa]) {
-    case '{':
-    case '[':
-    case ':':
-    case ',':
-      continue;
-    }
-    // Last document is complete, so the next document will appear after!
-    if (!arr_cnt && !obj_cnt) {
-      return parser.n_structural_indexes;
-    }
-    // Last document is incomplete; mark the document at i + 1 as the next one
-    return i;
-  }
-  return 0;
-}
-
-// Skip the last character if it is partial
-really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
-  if (unlikely(len < 3)) {
-    switch (len) {
-      case 2:
-        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
-        if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
-        return len;
-      case 1:
-        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
-        return len;
-      case 0:
-        return len;
-    }
-  }
-  if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
-  if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left
-  if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
-  return len;
-}
-/* end file src/generic/stage1/find_next_document_index.h */
 /* begin file src/generic/stage1/utf8_lookup2_algorithm.h */
 //
 // Detect Unicode errors.
@@ -8663,9 +8039,9 @@ really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
 //   support values with more than 23 bits (which a 4-byte character supports).
 //
 //   e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
-//
+//   
 // Legal utf-8 byte sequences per  http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
-//
+// 
 //   Code Points        1st       2s       3s       4s
 //  U+0000..U+007F     00..7F
 //  U+0080..U+07FF     C2..DF   80..BF
@@ -8680,7 +8056,6 @@ really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
 using namespace simd;
 
 namespace utf8_validation {
-  // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)".
 
   //
   // Find special case UTF-8 errors where the character is technically readable (has the right length)
@@ -8725,7 +8100,7 @@ namespace utf8_validation {
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
       // [0___]____ (ASCII)
-      0, 0, 0, 0,
+      0, 0, 0, 0,                          
       0, 0, 0, 0,
       // [10__]____ (continuation)
       0, 0, 0, 0,
@@ -8756,6 +8131,214 @@ namespace utf8_validation {
     return byte_1_high & byte_1_low & byte_2_high;
   }
 
+  //
+  // Validate the length of multibyte characters (that each multibyte character has the right number
+  // of continuation characters, and that all continuation characters are part of a multibyte
+  // character).
+  //
+  // Algorithm
+  // =========
+  //
+  // This algorithm compares *expected* continuation characters with *actual* continuation bytes,
+  // and emits an error anytime there is a mismatch.
+  //
+  // For example, in the string "𝄞₿֏ab", which has 4-, 3-, 2- and 1-byte
+  // characters, the bytes will look like this:
+  //
+  // | Character             | 𝄞  |    |    |    | ₿  |    |    | ֏  |    | a  | b  |
+  // |-----------------------|----|----|----|----|----|----|----|----|----|----|----|
+  // | Character Length      |  4 |    |    |    |  3 |    |    |  2 |    |  1 |  1 |
+  // | Byte                  | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 |
+  // | is_second_byte        |    |  X |    |    |    |  X |    |    |  X |    |    |
+  // | is_third_byte         |    |    |  X |    |    |    |  X |    |    |    |    |
+  // | is_fourth_byte        |    |    |    |  X |    |    |    |    |    |    |    |
+  // | expected_continuation |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
+  // | is_continuation       |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
+  //
+  // An error is basically any byte where (Second Byte OR Third Byte OR Fourth Byte) != Continuation:
+  //
+  // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not
+  //   part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just
+  //   floating around extra outside of any character, or that there is an illegal 5-byte character,
+  //   or maybe it's at the beginning of the file before any characters have started; but it's an
+  //   error in all these cases.
+  // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means
+  //   we started a new character before we were finished with the current one.
+  //
+  // Getting the Previous Bytes
+  // --------------------------
+  //
+  // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte
+  // character, we need to "shift the bytes" to find that out. This is what they mean:
+  //
+  // - `is_continuation`: if the current byte is a continuation.
+  // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character.
+  // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character.
+  // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character.
+  //
+  // We use shuffles to go n bytes back, selecting part of the current `input` and part of the
+  // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller
+  // function, because the 1-byte-back data is used by other checks as well.
+  //
+  // Getting the Continuation Mask
+  // -----------------------------
+  //
+  // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as
+  // numbers, using signed `<` and `>` operations to check if they are continuations or leads.
+  // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because
+  // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones).
+  //
+  // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads,"
+  // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them.
+  // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0.
+  //
+  // When treated as signed numbers, they look like this:
+  //
+  // | Type         | High Bits  | Binary Range | Signed |
+  // |--------------|------------|--------------|--------|
+  // | ASCII        | `0`        | `01111111`   |   127  |
+  // |              |            | `00000000`   |     0  |
+  // | 4+-Byte Lead | `1111`     | `11111111`   |    -1  |
+  // |              |            | `11110000`   |   -16  |
+  // | 3-Byte Lead  | `1110`     | `11101111`   |   -17  |
+  // |              |            | `11100000`   |   -32  |
+  // | 2-Byte Lead  | `110`      | `11011111`   |   -33  |
+  // |              |            | `11000000`   |   -64  |
+  // | Continuation | `10`       | `10111111`   |   -65  |
+  // |              |            | `10000000`   |  -128  |
+  //
+  // This makes it pretty easy to get the continuation mask! It's just a single comparison:
+  //
+  // ```
+  // is_continuation = input < -64
+  // ```
+  //
+  // We can do something similar for the others, but it takes two comparisons instead of one: "is
+  // the start of a 2-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and
+  // `> -65`. Surely we can do better, they're right next to each other!
+  //
+  // Getting the is_xxx Masks: Shifting the Range
+  // --------------------------------------------
+  //
+  // Notice *why* continuations were a single comparison. The actual *range* would require two
+  // comparisons--`< -64` and `> -129`--but all bytes are always greater than -129, so we get
+  // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be
+  // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`.
+  //
+  // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps
+  // ASCII down into the negative, and puts 4+-Byte Lead at the top:
+  //
+  // | Type                 | High Bits  | Binary Range | Signed |
+  // |----------------------|------------|--------------|-------|
+  // | 4+-Byte Lead (+ 128) | `0111`     | `01111111`   |   127 |
+  // |                      |            | `01110000`   |   112 |
+  // |----------------------|------------|--------------|-------|
+  // | 3-Byte Lead (+ 128)  | `0110`     | `01101111`   |   111 |
+  // |                      |            | `01100000`   |    96 |
+  // |----------------------|------------|--------------|-------|
+  // | 2-Byte Lead (+ 128)  | `010`      | `01011111`   |    95 |
+  // |                      |            | `01000000`   |    64 |
+  // |----------------------|------------|--------------|-------|
+  // | Continuation (+ 128) | `00`       | `00111111`   |    63 |
+  // |                      |            | `00000000`   |     0 |
+  // |----------------------|------------|--------------|-------|
+  // | ASCII (+ 128)        | `1`        | `11111111`   |    -1 |
+  // |                      |            | `10000000`   |  -128 |
+  // |----------------------|------------|--------------|-------|
+  // 
+  // *Now* we can use signed `>` on all of them:
+  //
+  // ```
+  // prev1 = input.prev<1>
+  // prev2 = input.prev<2>
+  // prev3 = input.prev<3>
+  // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128`
+  // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128`
+  // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128`
+  // is_second_byte = prev1_flipped > 63;  // 2+-byte lead
+  // is_third_byte  = prev2_flipped > 95;  // 3+-byte lead
+  // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead
+  // ```
+  //
+  // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number
+  // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3
+  // `^`'s at a time on Haswell, but only 2 `+`'s).
+  //
+  // That doesn't look like it saved us any instructions, did it? Well, because we're adding the
+  // same number to all of them, we can save one of those `+ 128` operations by assembling
+  // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128
+  // to it. One more instruction saved!
+  //
+  // ```
+  // prev1 = input.prev<1>
+  // prev3 = input.prev<3>
+  // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128`
+  // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128`
+  // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped); // shuffle: take the first 2 bytes from prev1 and the rest from prev3
+  // ```
+  //
+  // ### Bringing It All Together: Detecting the Errors
+  //
+  // At this point, we have `is_continuation`, `is_second_byte`, `is_third_byte` and `is_fourth_byte`.
+  // All we have left to do is check if they match!
+  //
+  // ```
+  // return (is_second_byte | is_third_byte | is_fourth_byte) ^ is_continuation;
+  // ```
+  //
+  // But wait--there's more. The above statement is only 3 operations, but they *cannot be done in
+  // parallel*. You have to do 2 `|`'s and then 1 `^`. Haswell, at least, has 3 ports that can do
+  // bitwise operations, and we're only using 1!
+  //
+  // Epilogue: Addition For Booleans
+  // -------------------------------
+  //
+  // There is one big case the above code doesn't explicitly talk about--what if is_second_byte
+  // and is_third_byte are BOTH true? That means there is a 3-byte and 2-byte character right next
+  // to each other (or any combination), and the continuation could be part of either of them!
+  // Our algorithm using `^` and `|` won't detect that the continuation byte is problematic.
+  //
+  // Never fear, though. If that situation occurs, we'll already have detected that the second
+  // leading byte was an error, because it was supposed to be a part of the preceding multibyte
+  // character, but it *wasn't a continuation*.
+  //
+  // We could stop here, but it turns out that we can fix it using `+` and `-` instead of `|` and
+  // `^`, which is both interesting and possibly useful (even though we're not using it here). It
+  // exploits the fact that in SIMD, a *true* value is -1, and a *false* value is 0. So those
+  // comparisons were giving us numbers!
+  //
+  // Given that, if you do `is_second_byte + is_third_byte + is_fourth_byte`, under normal
+  // circumstances you will either get 0 (0 + 0 + 0) or -1 (-1 + 0 + 0, etc.). Thus,
+  // `(is_second_byte + is_third_byte + is_fourth_byte) - is_continuation` will yield 0 only if
+  // *both* or *neither* are 0 (0-0 or -1 - -1). You'll get 1 or -1 if they are different. Because
+  // *any* nonzero value is treated as an error (not just -1), we're just fine here :)
+  //
+  // Further, if *more than one* multibyte character overlaps,
+  // `is_second_byte + is_third_byte + is_fourth_byte` will be -2 or -3! Subtracting `is_continuation`
+  // from *that* is guaranteed to give you a nonzero value (-1, -2 or -3). So it'll always be
+  // considered an error.
+  //
+  // One reason you might want to do this is parallelism. | and ^ cannot be regrouped across each
+  // other, so (A | B | C) ^ D will always be three operations in a row: either you do A | B -> | C -> ^ D, or
+  // you do B | C -> | A -> ^ D. But addition and subtraction *can* be regrouped: (A + B + C) - D can
+  // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and
+  // then add the results together. Same number of operations, but if the processor can run
+  // independent things in parallel (which most can), it runs faster.
+  //
+  // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have
+  // a super nice advantage in that more of them can be run at the same time (they can run on 3
+  // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C,
+  // saving us the cycle we would have earned by using +. Even more, using an instruction with a
+  // wider array of ports can help *other* code run ahead, too, since these instructions can "get
+  // out of the way," running on a port other instructions can't.
+  // 
+  // Epilogue II: One More Trick
+  // ---------------------------
+  //
+  // There's one more relevant trick up our sleeve: it turns out that on Intel we can "pay
+  // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in
+  // check_special_cases()--but we'll talk about that there :)
+  //
   really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, simd8<uint8_t> prev1) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
@@ -8893,22 +8476,16 @@ class bit_indexer {
 
 class json_structural_indexer {
 public:
-  /**
-   * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
-   *
-   * @param partial Setting the partial parameter to true allows the find_structural_bits to
-   *   tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
-   *   you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
-   */
   template<size_t STEP_SIZE>
-  static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept;
+  static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept;
 
 private:
-  really_inline json_structural_indexer(uint32_t *structural_indexes);
+  really_inline json_structural_indexer(uint32_t *structural_indexes)
+  : indexer{structural_indexes} {}
   template<size_t STEP_SIZE>
   really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
   really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
-  really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
+  really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming);
 
   json_scanner scanner{};
   utf8_checker checker{};
@@ -8917,8 +8494,65 @@ class json_structural_indexer {
   uint64_t unescaped_chars_error = 0;
 };
 
-really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
+really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
+  uint64_t unescaped = in.lteq(0x1F);
+  checker.check_next_input(in);
+  indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
+  prev_structurals = block.structural_start();
+  unescaped_chars_error |= block.non_quote_inside_string(unescaped);
+}
+
+really_inline error_code json_structural_indexer::finish(parser &parser, size_t idx, size_t len, bool streaming) {
+  // Write out the final iteration's structurals
+  indexer.write(uint32_t(idx-64), prev_structurals);
+
+  error_code error = scanner.finish(streaming);
+  if (unlikely(error != SUCCESS)) { return error; }
+
+  if (unescaped_chars_error) {
+    return UNESCAPED_CHARS;
+  }
+
+  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
+  /* a valid JSON file cannot have zero structural indexes - we should have
+   * found something */
+  if (unlikely(parser.n_structural_indexes == 0u)) {
+    return EMPTY;
+  }
+  if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
+    return UNEXPECTED_ERROR;
+  }
+  if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) {
+    /* the string might not be NULL terminated, but we add a virtual NULL
+     * ending character. */
+    parser.structural_indexes[parser.n_structural_indexes++] = uint32_t(len);
+  }
+  /* make it safe to dereference one beyond this array */
+  parser.structural_indexes[parser.n_structural_indexes] = 0;
+  return checker.errors();
+}
+
+template<>
+really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
+  simd::simd8x64<uint8_t> in_1(block);
+  simd::simd8x64<uint8_t> in_2(block+64);
+  json_block block_1 = scanner.next(in_1);
+  json_block block_2 = scanner.next(in_2);
+  this->next(in_1, block_1, reader.block_index());
+  this->next(in_2, block_2, reader.block_index()+64);
+  reader.advance();
+}
+
+template<>
+really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
+  simd::simd8x64<uint8_t> in_1(block);
+  json_block block_1 = scanner.next(in_1);
+  this->next(in_1, block_1, reader.block_index());
+  reader.advance();
+}
 
+//
+// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
 //
 // PERF NOTES:
 // We pipe 2 inputs through these stages:
@@ -8936,116 +8570,41 @@ really_inline json_structural_indexer::json_structural_indexer(uint32_t *structu
 // available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
 // workout.
 //
+// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings.
+// The caller should still ensure that the input is valid UTF-8. If you are processing substrings,
+// you may want to call on a function like trimmed_length_safe_utf8.
 template<size_t STEP_SIZE>
-error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept {
+error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept {
   if (unlikely(len > parser.capacity())) { return CAPACITY; }
-  if (partial) { len = trim_partial_utf8(buf, len); }
 
   buf_block_reader<STEP_SIZE> reader(buf, len);
   json_structural_indexer indexer(parser.structural_indexes.get());
-
-  // Read all but the last block
   while (reader.has_full_block()) {
     indexer.step<STEP_SIZE>(reader.full_block(), reader);
   }
 
-  // Take care of the last block (will always be there unless file is empty)
-  uint8_t block[STEP_SIZE];
-  if (unlikely(reader.get_remainder(block) == 0)) { return EMPTY; }
-  indexer.step<STEP_SIZE>(block, reader);
+  if (likely(reader.has_remainder())) {
+    uint8_t block[STEP_SIZE];
+    reader.get_remainder(block);
+    indexer.step<STEP_SIZE>(block, reader);
+  }
 
-  return indexer.finish(parser, reader.block_index(), len, partial);
-}
-
-template<>
-really_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
-  simd::simd8x64<uint8_t> in_1(block);
-  simd::simd8x64<uint8_t> in_2(block+64);
-  json_block block_1 = scanner.next(in_1);
-  json_block block_2 = scanner.next(in_2);
-  this->next(in_1, block_1, reader.block_index());
-  this->next(in_2, block_2, reader.block_index()+64);
-  reader.advance();
-}
-
-template<>
-really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
-  simd::simd8x64<uint8_t> in_1(block);
-  json_block block_1 = scanner.next(in_1);
-  this->next(in_1, block_1, reader.block_index());
-  reader.advance();
-}
-
-really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
-  uint64_t unescaped = in.lteq(0x1F);
-  checker.check_next_input(in);
-  indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
-  prev_structurals = block.structural_start();
-  unescaped_chars_error |= block.non_quote_inside_string(unescaped);
-}
-
-really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) {
-  // Write out the final iteration's structurals
-  indexer.write(uint32_t(idx-64), prev_structurals);
-
-  error_code error = scanner.finish(partial);
-  if (unlikely(error != SUCCESS)) { return error; }
-
-  if (unescaped_chars_error) {
-    return UNESCAPED_CHARS;
-  }
-
-  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
-  /***
-   * This is related to https://github.com/simdjson/simdjson/issues/906
-   * Basically, we want to make sure that if the parsing continues beyond the last (valid)
-   * structural character, it quickly stops.
-   * Only three structural characters can be repeated without triggering an error in JSON:  [,] and }.
-   * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
-   * continues, then it must be [,] or }.
-   * Suppose it is ] or }. We backtrack to the first character, what could it be that would
-   * not trigger an error? It could be ] or } but no, because you can't start a document that way.
-   * It can't be a comma, a colon or any simple value. So the only way we could continue is
-   * if the repeated character is [. But if so, the document must start with [. But if the document
-   * starts with [, it should end with ]. If we enforce that rule, then we would get
-   * ][[ which is invalid.
-   **/
-  parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
-  parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
-  parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
-  parser.next_structural_index = 0;
-  // a valid JSON file cannot have zero structural indexes - we should have found something
-  if (unlikely(parser.n_structural_indexes == 0u)) {
-    return EMPTY;
-  }
-  if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
-    return UNEXPECTED_ERROR;
-  }
-  if (partial) {
-    auto new_structural_indexes = find_next_document_index(parser);
-    if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
-      return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
-    }
-    parser.n_structural_indexes = new_structural_indexes;
-  }
-  return checker.errors();
+  return indexer.finish(parser, reader.block_index(), len, streaming);
 }
 
 } // namespace stage1
 /* end file src/generic/stage1/json_structural_indexer.h */
-WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
-  this->buf = _buf;
-  this->len = _len;
-  return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
+WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept {
+  return haswell::stage1::json_structural_indexer::index<128>(buf, len, parser, streaming);
 }
 
 } // namespace haswell
+
 } // namespace simdjson
 UNTARGET_REGION
-
-//
-// Stage 2
-//
+/* end file src/generic/stage1/json_structural_indexer.h */
+/* begin file src/haswell/stage2.cpp */
+/* haswell/implementation.h already included: #include "haswell/implementation.h" */
 /* begin file src/haswell/stringparsing.h */
 #ifndef SIMDJSON_HASWELL_STRINGPARSING_H
 #define SIMDJSON_HASWELL_STRINGPARSING_H
@@ -9456,10 +9015,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
   // If you consume a large value and you map it to "infinity", you will no
   // longer be able to serialize back a standard-compliant JSON. And there is
   // no realistic application where you might need values so large than they
-  // can't fit in binary64. The maximal value is about  1.7976931348623157 x
+  // can't fit in binary64. The maximal value is about  1.7976931348623157 ×
   // 10^308 It is an unimaginable large number. There will never be any piece of
   // engineering involving as many as 10^308 parts. It is estimated that there
-  // are about 10^80 atoms in the universe.  The estimate for the total number
+  // are about 10^80 atoms in the universe.  The estimate for the total number
   // of electrons is similar. Using a double-precision floating-point value, we
   // can represent easily the number of atoms in the universe. We could  also
   // represent the number of ways you can pick any three individual atoms at
@@ -9479,6 +9038,26 @@ really_inline bool is_integer(char c) {
   // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers
 }
 
+// We need to check that the character following a zero is valid. This is
+// probably frequent and it is harder than it looks. We are building all of this
+// just to differentiate between 0x1 (invalid), 0,1 (valid), 0e1 (valid)...
+const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
+    1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+really_inline bool
+is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
+  return structural_or_whitespace_or_exponent_or_decimal_negated[c];
+}
 
 // check quickly whether the next 8 chars are made of digits
 // at a glance, it looks better than Mula's
@@ -9556,14 +9135,14 @@ never_inline bool parse_large_integer(const uint8_t *const src,
       // as a positive signed integer, but the negative version is
       // possible.
       constexpr int64_t signed_answer = INT64_MIN;
-      writer.append_s64(signed_answer);
+      writer.write_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
     } else {
       // we can negate safely
       int64_t signed_answer = -static_cast<int64_t>(i);
-      writer.append_s64(signed_answer);
+      writer.write_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
@@ -9576,12 +9155,12 @@ never_inline bool parse_large_integer(const uint8_t *const src,
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(i, src);
 #endif
-      writer.append_s64(i);
+      writer.write_s64(i);
     } else {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_unsigned_integer(i, src);
 #endif
-      writer.append_u64(i);
+      writer.write_u64(i);
     }
   }
   return is_structural_or_whitespace(*p);
@@ -9591,7 +9170,7 @@ template<typename W>
 bool slow_float_parsing(UNUSED const char * src, W writer) {
   double d;
   if (parse_float_strtod(src, &d)) {
-    writer.append_double(d);
+    writer.write_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_float(d, (const uint8_t *)src);
 #endif
@@ -9615,10 +9194,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) {
 template<typename W>
 really_inline bool parse_number(UNUSED const uint8_t *const src,
                                 UNUSED bool found_minus,
-                                W &writer) {
+                                W writer) {
 #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
                                   // useful to skip parsing
-  writer.append_s64(0);        // always write zero
+  writer.write_s64(0);        // always write zero
   return true;                    // always succeeds
 #else
   const char *p = reinterpret_cast<const char *>(src);
@@ -9638,7 +9217,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
   uint64_t i;      // an unsigned int avoids signed overflows (which are bad)
   if (*p == '0') { // 0 cannot be followed by an integer
     ++p;
-    if (is_integer(*p)) {
+    if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_invalid_number(src);
 #endif
@@ -9762,7 +9341,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       }
       // we over-decrement by one when there is a '.'
       digit_count -= int(start - start_digits);
-      if (unlikely(digit_count >= 19)) {
+      if (digit_count >= 19) {
         // Ok, chances are good that we had an overflow!
         // this is almost never going to get called!!!
         // we start anew, going slowly!!!
@@ -9770,22 +9349,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
         // 10000000000000000000000000000000000000000000e+308
         // 3.1415926535897932384626433832795028841971693993751
         //
-        bool success = slow_float_parsing((const char *) src, writer);
-        // The number was already written, but we made a copy of the writer
-        // when we passed it to the parse_large_integer() function, so 
-        writer.skip_double();
-        return success;
+        return slow_float_parsing((const char *) src, writer);
       }
     }
     if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) ||
         (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!!
       // this is almost never going to get called!!!
       // we start anew, going slowly!!!
-      bool success = slow_float_parsing((const char *) src, writer);
-      // The number was already written, but we made a copy of the writer when we passed it to the
-      // slow_float_parsing() function, so we have to skip those tape spots now that we've returned
-      writer.skip_double();
-      return success;
+      return slow_float_parsing((const char *) src, writer);
     }
     bool success = true;
     double d = compute_float_64(exponent, i, negative, &success);
@@ -9794,7 +9365,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       success = parse_float_strtod((const char *)src, &d);
     }
     if (success) {
-      writer.append_double(d);
+      writer.write_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_float(d, src);
 #endif
@@ -9809,14 +9380,10 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
     if (unlikely(digit_count >= 18)) { // this is uncommon!!!
       // there is a good chance that we had an overflow, so we need
       // need to recover: we parse the whole thing again.
-      bool success = parse_large_integer(src, writer, found_minus);
-      // The number was already written, but we made a copy of the writer
-      // when we passed it to the parse_large_integer() function, so 
-      writer.skip_large_integer();
-      return success;
+      return parse_large_integer(src, writer, found_minus);
     }
     i = negative ? 0 - i : i;
-    writer.append_s64(i);
+    writer.write_s64(i);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_integer(i, src);
 #endif
@@ -9841,72 +9408,6 @@ TARGET_HASWELL
 namespace simdjson {
 namespace haswell {
 
-/* begin file src/generic/stage2/logger.h */
-// This is for an internal-only stage 2 specific logger.
-// Set LOG_ENABLED = true to log what stage 2 is doing!
-namespace logger {
-  static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
-
-  static constexpr const bool LOG_ENABLED = false;
-  static constexpr const int LOG_EVENT_LEN = 30;
-  static constexpr const int LOG_BUFFER_LEN = 20;
-  static constexpr const int LOG_DETAIL_LEN = 50;
-  static constexpr const int LOG_INDEX_LEN = 10;
-
-  static int log_depth; // Not threadsafe. Log only.
-
-  // Helper to turn unprintable or newline characters into spaces
-  static really_inline char printable_char(char c) {
-    if (c >= 0x20) {
-      return c;
-    } else {
-      return ' ';
-    }
-  }
-
-  // Print the header and set up log_start
-  static really_inline void log_start() {
-    if (LOG_ENABLED) {
-      log_depth = 0;
-      printf("\n");
-      printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index");
-      printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES);
-    }
-  }
-
-  static really_inline void log_string(const char *message) {
-    if (LOG_ENABLED) {
-      printf("%s\n", message);
-    }
-  }
-
-  // Logs a single line of 
-  template<typename S>
-  static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
-    if (LOG_ENABLED) {
-      printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
-      {
-        // Print the next N characters in the buffer.
-        printf("| ");
-        // Otherwise, print the characters starting from the buffer position.
-        // Print spaces for unprintable or newline characters.
-        for (int i=0;i<LOG_BUFFER_LEN;i++) {
-          printf("%c", printable_char(structurals.current()[i]));
-        }
-        printf(" ");
-      }
-      printf("|    %c ", printable_char(structurals.current_char()));
-      printf("|    %c ", printable_char(structurals.peek_next_char()));
-      printf("| %5u ", structurals.parser.structural_indexes[*(structurals.current_structural+1)]);
-      printf("| %5u ", structurals.next_tape_index());
-      printf("| %-*s ", LOG_DETAIL_LEN, detail);
-      printf("| %*u ", LOG_INDEX_LEN, *structurals.current_structural);
-      printf("|\n");
-    }
-  }
-} // namespace logger
-
-/* end file src/generic/stage2/logger.h */
 /* begin file src/generic/stage2/atomparsing.h */
 namespace stage2 {
 namespace atomparsing {
@@ -9965,34 +9466,26 @@ namespace stage2 {
 
 class structural_iterator {
 public:
-  const uint8_t* const buf;
-  uint32_t *current_structural;
-  dom_parser_implementation &parser;
-
-  // Start a structural 
-  really_inline structural_iterator(dom_parser_implementation &_parser, size_t start_structural_index)
-    : buf{_parser.buf},
-      current_structural{&_parser.structural_indexes[start_structural_index]},
-      parser{_parser} {
-  }
-  // Get the buffer position of the current structural character
-  really_inline const uint8_t* current() {
-    return &buf[*current_structural];
+  really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index)
+    : buf{_buf},
+     len{_len},
+     structural_indexes{_structural_indexes},
+     next_structural{next_structural_index}
+    {}
+  really_inline char advance_char() {
+    idx = structural_indexes[next_structural];
+    next_structural++;
+    c = *current();
+    return c;
   }
-  // Get the current structural character
   really_inline char current_char() {
-    return buf[*current_structural];
+    return c;
   }
-  // Get the next structural character without advancing
-  really_inline char peek_next_char() {
-    return buf[*(current_structural+1)];
-  }
-  really_inline char advance_char() {
-    current_structural++;
-    return buf[*current_structural];
+  really_inline const uint8_t* current() {
+    return &buf[idx];
   }
   really_inline size_t remaining_len() {
-    return parser.len - *current_structural;
+    return len - idx;
   }
   template<typename F>
   really_inline bool with_space_terminated_copy(const F& f) {
@@ -10009,25 +9502,32 @@ class structural_iterator {
     * practice unless you are in the strange scenario where you have many JSON
     * documents made of single atoms.
     */
-    char *copy = static_cast<char *>(malloc(parser.len + SIMDJSON_PADDING));
+    char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
     if (copy == nullptr) {
       return true;
     }
-    memcpy(copy, buf, parser.len);
-    memset(copy + parser.len, ' ', SIMDJSON_PADDING);
-    bool result = f(reinterpret_cast<const uint8_t*>(copy), *current_structural);
+    memcpy(copy, buf, len);
+    memset(copy + len, ' ', SIMDJSON_PADDING);
+    bool result = f(reinterpret_cast<const uint8_t*>(copy), idx);
     free(copy);
     return result;
   }
   really_inline bool past_end(uint32_t n_structural_indexes) {
-    return current_structural >= &parser.structural_indexes[n_structural_indexes];
+    return next_structural+1 > n_structural_indexes;
   }
   really_inline bool at_end(uint32_t n_structural_indexes) {
-    return current_structural == &parser.structural_indexes[n_structural_indexes];
+    return next_structural+1 == n_structural_indexes;
   }
-  really_inline bool at_beginning() {
-    return current_structural == parser.structural_indexes.get();
+  really_inline size_t next_structural_index() {
+    return next_structural;
   }
+
+  const uint8_t* const buf;
+  const size_t len;
+  const uint32_t* const structural_indexes;
+  size_t next_structural; // next structural index
+  size_t idx{0}; // location of the structural character in the input (buf)
+  uint8_t c{0};  // used to track the (structural) character we are looking at
 };
 
 } // namespace stage2
@@ -10039,105 +9539,8 @@ class structural_iterator {
 // "simdjson/stage2.h" (this simplifies amalgation)
 
 namespace stage2 {
-namespace { // Make everything here private
-
-/* begin file src/generic/stage2/tape_writer.h */
-struct tape_writer {
-  /** The next place to write to tape */
-  uint64_t *next_tape_loc;
-  
-  /** Write a signed 64-bit value to tape. */
-  really_inline void append_s64(int64_t value) noexcept;
-
-  /** Write an unsigned 64-bit value to tape. */
-  really_inline void append_u64(uint64_t value) noexcept;
-
-  /** Write a double value to tape. */
-  really_inline void append_double(double value) noexcept;
-
-  /**
-   * Append a tape entry (an 8-bit type,and 56 bits worth of value).
-   */
-  really_inline void append(uint64_t val, internal::tape_type t) noexcept;
-
-  /**
-   * Skip the current tape entry without writing.
-   *
-   * Used to skip the start of the container, since we'll come back later to fill it in when the
-   * container ends.
-   */
-  really_inline void skip() noexcept;
-
-  /**
-   * Skip the number of tape entries necessary to write a large u64 or i64.
-   */
-  really_inline void skip_large_integer() noexcept;
-
-  /**
-   * Skip the number of tape entries necessary to write a double.
-   */
-  really_inline void skip_double() noexcept;
-
-  /**
-   * Write a value to a known location on tape.
-   *
-   * Used to go back and write out the start of a container after the container ends.
-   */
-  really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
-
-private:
-  /**
-   * Append both the tape entry, and a supplementary value following it. Used for types that need
-   * all 64 bits, such as double and uint64_t.
-   */
-  template<typename T>
-  really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
-}; // struct number_writer
-
-really_inline void tape_writer::append_s64(int64_t value) noexcept {
-  append2(0, value, internal::tape_type::INT64);
-}
-
-really_inline void tape_writer::append_u64(uint64_t value) noexcept {
-  append(0, internal::tape_type::UINT64);
-  *next_tape_loc = value;
-  next_tape_loc++;
-}
-
-/** Write a double value to tape. */
-really_inline void tape_writer::append_double(double value) noexcept {
-  append2(0, value, internal::tape_type::DOUBLE);
-}
-
-really_inline void tape_writer::skip() noexcept {
-  next_tape_loc++;
-}
-
-really_inline void tape_writer::skip_large_integer() noexcept {
-  next_tape_loc += 2;
-}
-
-really_inline void tape_writer::skip_double() noexcept {
-  next_tape_loc += 2;
-}
 
-really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
-  *next_tape_loc = val | ((uint64_t(char(t))) << 56);
-  next_tape_loc++;
-}
-
-template<typename T>
-really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
-  append(val, t);
-  static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
-  memcpy(next_tape_loc, &val2, sizeof(val2));
-  next_tape_loc++;
-}
-
-really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
-  tape_loc = val | ((uint64_t(char(t))) << 56);
-}
-/* end file src/generic/stage2/tape_writer.h */
+using internal::ret_address;
 
 #ifdef SIMDJSON_USE_COMPUTED_GOTO
 #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue }
@@ -10168,88 +9571,102 @@ really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal
 #endif // SIMDJSON_USE_COMPUTED_GOTO
 
 struct unified_machine_addresses {
-  ret_address_t array_begin;
-  ret_address_t array_continue;
-  ret_address_t error;
-  ret_address_t finish;
-  ret_address_t object_begin;
-  ret_address_t object_continue;
+  ret_address array_begin;
+  ret_address array_continue;
+  ret_address error;
+  ret_address finish;
+  ret_address object_begin;
+  ret_address object_continue;
 };
 
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } }
 
-struct structural_parser : structural_iterator {
-  /** Lets you append to the tape */
-  tape_writer tape;
+struct number_writer {
+  parser &doc_parser;
+  
+  really_inline void write_s64(int64_t value) noexcept {
+    write_tape(0, internal::tape_type::INT64);
+    std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value));
+    ++doc_parser.current_loc;
+  }
+  really_inline void write_u64(uint64_t value) noexcept {
+    write_tape(0, internal::tape_type::UINT64);
+    doc_parser.doc.tape[doc_parser.current_loc++] = value;
+  }
+  really_inline void write_double(double value) noexcept {
+    write_tape(0, internal::tape_type::DOUBLE);
+    static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size");
+    memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double));
+    // doc.tape[doc.current_loc++] = *((uint64_t *)&d);
+  }
+  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
+    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
+  }
+}; // struct number_writer
+
+struct structural_parser {
+  structural_iterator structurals;
+  parser &doc_parser;
   /** Next write location in the string buf for stage 2 parsing */
-  uint8_t *current_string_buf_loc;
-  /** Current depth (nested objects and arrays) */
-  uint32_t depth{0};
-
-  // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations
-  really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index)
-    : structural_iterator(_parser, start_structural_index),
-      tape{parser.doc->tape.get()},
-      current_string_buf_loc{parser.doc->string_buf.get()} {
-  }
-
-  WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) {
-    parser.containing_scope[depth].tape_index = next_tape_index();
-    parser.containing_scope[depth].count = 0;
-    tape.skip(); // We don't actually *write* the start element until the end.
-    parser.ret_address[depth] = continue_state;
+  uint8_t *current_string_buf_loc{};
+  uint32_t depth;
+
+  really_inline structural_parser(
+    const uint8_t *buf,
+    size_t len,
+    parser &_doc_parser,
+    uint32_t next_structural = 0
+  ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {}
+
+  WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) {
+    doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc;
+    doc_parser.containing_scope[depth].count = 0;
+    write_tape(0, type); // if the document is correct, this gets rewritten later
+    doc_parser.ret_address[depth] = continue_state;
     depth++;
-    bool exceeded_max_depth = depth >= parser.max_depth();
-    if (exceeded_max_depth) { log_error("Exceeded max depth!"); }
-    return exceeded_max_depth;
+    return depth >= doc_parser.max_depth();
   }
 
-  WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) {
-    log_start_value("document");
-    return start_scope(continue_state);
+  WARN_UNUSED really_inline bool start_document(ret_address continue_state) {
+    return start_scope(internal::tape_type::ROOT, continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) {
-    log_start_value("object");
-    return start_scope(continue_state);
+  WARN_UNUSED really_inline bool start_object(ret_address continue_state) {
+    return start_scope(internal::tape_type::START_OBJECT, continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) {
-    log_start_value("array");
-    return start_scope(continue_state);
+  WARN_UNUSED really_inline bool start_array(ret_address continue_state) {
+    return start_scope(internal::tape_type::START_ARRAY, continue_state);
   }
 
   // this function is responsible for annotating the start of the scope
-  really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept {
+  really_inline void end_scope(internal::tape_type type) noexcept {
     depth--;
-    // write our doc->tape location to the header scope
+    // write our doc.tape location to the header scope
     // The root scope gets written *at* the previous location.
-    tape.append(parser.containing_scope[depth].tape_index, end);
+    write_tape(doc_parser.containing_scope[depth].tape_index, type);
     // count can overflow if it exceeds 24 bits... so we saturate
     // the convention being that a cnt of 0xffffff or more is undetermined in value (>=  0xffffff).
-    const uint32_t start_tape_index = parser.containing_scope[depth].tape_index;
-    const uint32_t count = parser.containing_scope[depth].count;
+    const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index;
+    const uint32_t count = doc_parser.containing_scope[depth].count;
     const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
-    // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index]
-    tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start);
-  }
-
-  really_inline uint32_t next_tape_index() {
-    return uint32_t(tape.next_tape_loc - parser.doc->tape.get());
+    // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index]
+    doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32);
   }
 
   really_inline void end_object() {
-    log_end_value("object");
-    end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
+    end_scope(internal::tape_type::END_OBJECT);
   }
   really_inline void end_array() {
-    log_end_value("array");
-    end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
+    end_scope(internal::tape_type::END_ARRAY);
   }
   really_inline void end_document() {
-    log_end_value("document");
-    end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT);
+    end_scope(internal::tape_type::ROOT);
+  }
+
+  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
+    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
   }
 
   // increment_count increments the count of keys in an object or values in an array.
@@ -10257,16 +9674,17 @@ struct structural_parser : structural_iterator {
   // must be increment in the preceding depth (depth-1) where the array or
   // the object resides.
   really_inline void increment_count() {
-    parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
+    doc_parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
   }
 
   really_inline uint8_t *on_start_string() noexcept {
-    // we advance the point, accounting for the fact that we have a NULL termination
-    tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING);
+    /* we advance the point, accounting for the fact that we have a NULL
+      * termination         */
+    write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING);
     return current_string_buf_loc + sizeof(uint32_t);
   }
 
-  really_inline void on_end_string(uint8_t *dst) noexcept {
+  really_inline bool on_end_string(uint8_t *dst) noexcept {
     uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
     // TODO check for overflow in case someone has a crazy string (>=4GB?)
     // But only add the overflow check when the document itself exceeds 4GB
@@ -10276,49 +9694,73 @@ struct structural_parser : structural_iterator {
     // be NULL terminated? It comes at a small cost
     *dst = 0;
     current_string_buf_loc = dst + 1;
+    return true;
   }
 
-  WARN_UNUSED really_inline bool parse_string(bool key = false) {
-    log_value(key ? "key" : "string");
+  WARN_UNUSED really_inline bool parse_string() {
     uint8_t *dst = on_start_string();
-    dst = stringparsing::parse_string(current(), dst);
+    dst = stringparsing::parse_string(structurals.current(), dst);
     if (dst == nullptr) {
-      log_error("Invalid escape in string");
       return true;
     }
-    on_end_string(dst);
-    return false;
+    return !on_end_string(dst);
   }
 
   WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) {
-    log_value("number");
-    bool succeeded = numberparsing::parse_number(src, found_minus, tape);
-    if (!succeeded) { log_error("Invalid number"); }
-    return !succeeded;
+    number_writer writer{doc_parser};
+    return !numberparsing::parse_number(src, found_minus, writer);
   }
   WARN_UNUSED really_inline bool parse_number(bool found_minus) {
-    return parse_number(current(), found_minus);
+    return parse_number(structurals.current(), found_minus);
+  }
+
+  WARN_UNUSED really_inline bool parse_atom() {
+    switch (structurals.current_char()) {
+      case 't':
+        if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; }
+        write_tape(0, internal::tape_type::TRUE_VALUE);
+        break;
+      case 'f':
+        if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; }
+        write_tape(0, internal::tape_type::FALSE_VALUE);
+        break;
+      case 'n':
+        if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; }
+        write_tape(0, internal::tape_type::NULL_VALUE);
+        break;
+      default:
+        return true;
+    }
+    return false;
+  }
+
+  WARN_UNUSED really_inline bool parse_single_atom() {
+    switch (structurals.current_char()) {
+      case 't':
+        if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; }
+        write_tape(0, internal::tape_type::TRUE_VALUE);
+        break;
+      case 'f':
+        if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; }
+        write_tape(0, internal::tape_type::FALSE_VALUE);
+        break;
+      case 'n':
+        if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; }
+        write_tape(0, internal::tape_type::NULL_VALUE);
+        break;
+      default:
+        return true;
+    }
+    return false;
   }
 
-  WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) {
-    switch (advance_char()) {
+  WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) {
+    switch (structurals.current_char()) {
     case '"':
       FAIL_IF( parse_string() );
       return continue_state;
-    case 't':
-      log_value("true");
-      FAIL_IF( !atomparsing::is_valid_true_atom(current()) );
-      tape.append(0, internal::tape_type::TRUE_VALUE);
-      return continue_state;
-    case 'f':
-      log_value("false");
-      FAIL_IF( !atomparsing::is_valid_false_atom(current()) );
-      tape.append(0, internal::tape_type::FALSE_VALUE);
-      return continue_state;
-    case 'n':
-      log_value("null");
-      FAIL_IF( !atomparsing::is_valid_null_atom(current()) );
-      tape.append(0, internal::tape_type::NULL_VALUE);
+    case 't': case 'f': case 'n':
+      FAIL_IF( parse_atom() );
       return continue_state;
     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
@@ -10334,27 +9776,40 @@ struct structural_parser : structural_iterator {
       FAIL_IF( start_array(continue_state) );
       return addresses.array_begin;
     default:
-      log_error("Non-value found when value was expected!");
       return addresses.error;
     }
   }
 
   WARN_UNUSED really_inline error_code finish() {
+    // the string might not be NULL terminated.
+    if ( !structurals.at_end(doc_parser.n_structural_indexes) ) {
+      return on_error(TAPE_ERROR);
+    }
     end_document();
-    parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]);
-
     if (depth != 0) {
-      log_error("Unclosed objects or arrays!");
-      return parser.error = TAPE_ERROR;
+      return on_error(TAPE_ERROR);
+    }
+    if (doc_parser.containing_scope[depth].tape_index != 0) {
+      return on_error(TAPE_ERROR);
     }
 
-    return SUCCESS;
+    return on_success(SUCCESS);
+  }
+
+  really_inline error_code on_error(error_code new_error_code) noexcept {
+    doc_parser.error = new_error_code;
+    return new_error_code;
+  }
+  really_inline error_code on_success(error_code success_code) noexcept {
+    doc_parser.error = success_code;
+    doc_parser.valid = true;
+    return success_code;
   }
 
   WARN_UNUSED really_inline error_code error() {
-    /* We do not need the next line because this is done by parser.init_stage2(),
+    /* We do not need the next line because this is done by doc_parser.init_stage2(),
     * pessimistically.
-    * parser.is_valid  = false;
+    * doc_parser.is_valid  = false;
     * At this point in the code, we have all the time in the world.
     * Note that we know exactly where we are in the document so we could,
     * without any overhead on the processing code, report a specific
@@ -10362,12 +9817,12 @@ struct structural_parser : structural_iterator {
     * We could even trigger special code paths to assess what happened
     * carefully,
     * all without any added cost. */
-    if (depth >= parser.max_depth()) {
-      return parser.error = DEPTH_ERROR;
+    if (depth >= doc_parser.max_depth()) {
+      return on_error(DEPTH_ERROR);
     }
-    switch (current_char()) {
+    switch (structurals.current_char()) {
     case '"':
-      return parser.error = STRING_ERROR;
+      return on_error(STRING_ERROR);
     case '0':
     case '1':
     case '2':
@@ -10379,173 +9834,302 @@ struct structural_parser : structural_iterator {
     case '8':
     case '9':
     case '-':
-      return parser.error = NUMBER_ERROR;
+      return on_error(NUMBER_ERROR);
     case 't':
-      return parser.error = T_ATOM_ERROR;
+      return on_error(T_ATOM_ERROR);
     case 'n':
-      return parser.error = N_ATOM_ERROR;
+      return on_error(N_ATOM_ERROR);
     case 'f':
-      return parser.error = F_ATOM_ERROR;
+      return on_error(F_ATOM_ERROR);
     default:
-      return parser.error = TAPE_ERROR;
+      return on_error(TAPE_ERROR);
     }
   }
 
   really_inline void init() {
-    log_start();
-    parser.error = UNINITIALIZED;
+    current_string_buf_loc = doc_parser.doc.string_buf.get();
+    doc_parser.current_loc = 0;
+    doc_parser.valid = false;
+    doc_parser.error = UNINITIALIZED;
   }
 
-  WARN_UNUSED really_inline error_code start(ret_address_t finish_state) {
-    // If there are no structurals left, return EMPTY
-    if (at_end(parser.n_structural_indexes)) {
-      return parser.error = EMPTY;
+  WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) {
+    init(); // sets is_valid to false
+    if (len > doc_parser.capacity()) {
+      return CAPACITY;
     }
-
-    init();
+    // Advance to the first character as soon as possible
+    structurals.advance_char();
     // Push the root scope (there is always at least one scope)
     if (start_document(finish_state)) {
-      return parser.error = DEPTH_ERROR;
+      return on_error(DEPTH_ERROR);
     }
     return SUCCESS;
   }
 
-  really_inline void log_value(const char *type) {
-    logger::log_line(*this, "", type, "");
+  really_inline char advance_char() {
+    return structurals.advance_char();
+  }
+};
+
+// Redefine FAIL_IF to use goto since it'll be used inside the function now
+#undef FAIL_IF
+#define FAIL_IF(EXPR) { if (EXPR) { goto error; } }
+
+} // namespace stage2
+
+/************
+ * The JSON is parsed to a tape, see the accompanying tape.md file
+ * for documentation.
+ ***********/
+WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
+  static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
+  stage2::structural_parser parser(buf, len, doc_parser);
+  error_code result = parser.start(len, addresses.finish);
+  if (result) { return result; }
+
+  //
+  // Read first value
+  //
+  switch (parser.structurals.current_char()) {
+  case '{':
+    FAIL_IF( parser.start_object(addresses.finish) );
+    goto object_begin;
+  case '[':
+    FAIL_IF( parser.start_array(addresses.finish) );
+    goto array_begin;
+  case '"':
+    FAIL_IF( parser.parse_string() );
+    goto finish;
+  case 't': case 'f': case 'n':
+    FAIL_IF( parser.parse_single_atom() );
+    goto finish;
+  case '0': case '1': case '2': case '3': case '4':
+  case '5': case '6': case '7': case '8': case '9':
+    FAIL_IF(
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+        return parser.parse_number(&copy[idx], false);
+      })
+    );
+    goto finish;
+  case '-':
+    FAIL_IF(
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+        return parser.parse_number(&copy[idx], true);
+      })
+    );
+    goto finish;
+  default:
+    goto error;
+  }
+
+//
+// Object parser states
+//
+object_begin:
+  switch (parser.advance_char()) {
+  case '"': {
+    parser.increment_count();
+    FAIL_IF( parser.parse_string() );
+    goto object_key_state;
+  }
+  case '}':
+    parser.end_object();
+    goto scope_end;
+  default:
+    goto error;
+  }
+
+object_key_state:
+  FAIL_IF( parser.advance_char() != ':' );
+  parser.advance_char();
+  GOTO( parser.parse_value(addresses, addresses.object_continue) );
+
+object_continue:
+  switch (parser.advance_char()) {
+  case ',':
+    parser.increment_count();
+    FAIL_IF( parser.advance_char() != '"' );
+    FAIL_IF( parser.parse_string() );
+    goto object_key_state;
+  case '}':
+    parser.end_object();
+    goto scope_end;
+  default:
+    goto error;
+  }
+
+scope_end:
+  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
+
+//
+// Array parser states
+//
+array_begin:
+  if (parser.advance_char() == ']') {
+    parser.end_array();
+    goto scope_end;
   }
+  parser.increment_count();
 
-  static really_inline void log_start() {
-    logger::log_start();
+main_array_switch:
+  /* we call update char on all paths in, so we can peek at parser.c on the
+   * on paths that can accept a close square brace (post-, and at start) */
+  GOTO( parser.parse_value(addresses, addresses.array_continue) );
+
+array_continue:
+  switch (parser.advance_char()) {
+  case ',':
+    parser.increment_count();
+    parser.advance_char();
+    goto main_array_switch;
+  case ']':
+    parser.end_array();
+    goto scope_end;
+  default:
+    goto error;
   }
 
-  really_inline void log_start_value(const char *type) {
-    logger::log_line(*this, "+", type, "");
-    if (logger::LOG_ENABLED) { logger::log_depth++; }
+finish:
+  return parser.finish();
+
+error:
+  return parser.error();
+}
+
+WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
+  error_code code = stage1(buf, len, doc_parser, false);
+  if (!code) {
+    code = stage2(buf, len, doc_parser);
   }
+  return code;
+}
+/* end file src/generic/stage2/structural_parser.h */
+/* begin file src/generic/stage2/streaming_structural_parser.h */
+namespace stage2 {
+
+struct streaming_structural_parser: structural_parser {
+  really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {}
 
-  really_inline void log_end_value(const char *type) {
-    if (logger::LOG_ENABLED) { logger::log_depth--; }
-    logger::log_line(*this, "-", type, "");
+  // override to add streaming
+  WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) {
+    init(); // sets is_valid to false
+    // Capacity ain't no thang for streaming, so we don't check it.
+    // Advance to the first character as soon as possible
+    advance_char();
+    // Push the root scope (there is always at least one scope)
+    if (start_document(finish_parser)) {
+      return on_error(DEPTH_ERROR);
+    }
+    return SUCCESS;
   }
 
-  really_inline void log_error(const char *error) {
-    logger::log_line(*this, "", "ERROR", error);
+  // override to add streaming
+  WARN_UNUSED really_inline error_code finish() {
+    if ( structurals.past_end(doc_parser.n_structural_indexes) ) {
+      return on_error(TAPE_ERROR);
+    }
+    end_document();
+    if (depth != 0) {
+      return on_error(TAPE_ERROR);
+    }
+    if (doc_parser.containing_scope[depth].tape_index != 0) {
+      return on_error(TAPE_ERROR);
+    }
+    bool finished = structurals.at_end(doc_parser.n_structural_indexes);
+    return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
   }
-}; // struct structural_parser
+};
 
-// Redefine FAIL_IF to use goto since it'll be used inside the function now
-#undef FAIL_IF
-#define FAIL_IF(EXPR) { if (EXPR) { goto error; } }
+} // namespace stage2
 
-template<bool STREAMING>
-WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
-  dom_parser.doc = &doc;
+/************
+ * The JSON is parsed to a tape, see the accompanying tape.md file
+ * for documentation.
+ ***********/
+WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept {
   static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
-  stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
-  error_code result = parser.start(addresses.finish);
+  stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json));
+  error_code result = parser.start(len, addresses.finish);
   if (result) { return result; }
-
   //
   // Read first value
   //
-  switch (parser.current_char()) {
+  switch (parser.structurals.current_char()) {
   case '{':
     FAIL_IF( parser.start_object(addresses.finish) );
     goto object_begin;
   case '[':
     FAIL_IF( parser.start_array(addresses.finish) );
-    // Make sure the outer array is closed before continuing; otherwise, there are ways we could get
-    // into memory corruption. See https://github.com/simdjson/simdjson/issues/906
-    if (!STREAMING) {
-      if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') {
-        goto error;
-      }
-    }
     goto array_begin;
   case '"':
     FAIL_IF( parser.parse_string() );
     goto finish;
-  case 't':
-    parser.log_value("true");
-    FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) );
-    parser.tape.append(0, internal::tape_type::TRUE_VALUE);
-    goto finish;
-  case 'f':
-    parser.log_value("false");
-    FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) );
-    parser.tape.append(0, internal::tape_type::FALSE_VALUE);
-    goto finish;
-  case 'n':
-    parser.log_value("null");
-    FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) );
-    parser.tape.append(0, internal::tape_type::NULL_VALUE);
+  case 't': case 'f': case 'n':
+    FAIL_IF( parser.parse_single_atom() );
     goto finish;
   case '0': case '1': case '2': case '3': case '4':
   case '5': case '6': case '7': case '8': case '9':
     FAIL_IF(
-      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], false);
       })
     );
     goto finish;
   case '-':
     FAIL_IF(
-      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], true);
       })
     );
     goto finish;
   default:
-    parser.log_error("Document starts with a non-value character");
     goto error;
   }
 
 //
-// Object parser states
+// Object parser parsers
 //
 object_begin:
   switch (parser.advance_char()) {
   case '"': {
-    parser.increment_count();
-    FAIL_IF( parser.parse_string(true) );
-    goto object_key_state;
+    FAIL_IF( parser.parse_string() );
+    goto object_key_parser;
   }
   case '}':
     parser.end_object();
     goto scope_end;
   default:
-    parser.log_error("Object does not start with a key");
     goto error;
   }
 
-object_key_state:
-  if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; }
+object_key_parser:
+  FAIL_IF( parser.advance_char() != ':' );
+  parser.increment_count();
+  parser.advance_char();
   GOTO( parser.parse_value(addresses, addresses.object_continue) );
 
 object_continue:
   switch (parser.advance_char()) {
   case ',':
-    parser.increment_count();
-    if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; }
-    FAIL_IF( parser.parse_string(true) );
-    goto object_key_state;
+    FAIL_IF( parser.advance_char() != '"' );
+    FAIL_IF( parser.parse_string() );
+    goto object_key_parser;
   case '}':
     parser.end_object();
     goto scope_end;
   default:
-    parser.log_error("No comma between object fields");
     goto error;
   }
 
 scope_end:
-  CONTINUE( parser.parser.ret_address[parser.depth] );
+  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
 
 //
-// Array parser states
+// Array parser parsers
 //
 array_begin:
-  if (parser.peek_next_char() == ']') {
-    parser.advance_char();
+  if (parser.advance_char() == ']') {
     parser.end_array();
     goto scope_end;
   }
@@ -10560,208 +10144,31 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
   switch (parser.advance_char()) {
   case ',':
     parser.increment_count();
+    parser.advance_char();
     goto main_array_switch;
   case ']':
     parser.end_array();
     goto scope_end;
   default:
-    parser.log_error("Missing comma between array values");
     goto error;
   }
 
 finish:
+  next_json = parser.structurals.next_structural_index();
   return parser.finish();
 
 error:
   return parser.error();
 }
-
-} // namespace {}
-} // namespace stage2
-
-/************
- * The JSON is parsed to a tape, see the accompanying tape.md file
- * for documentation.
- ***********/
-WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
-  error_code result = stage2::parse_structurals<false>(*this, _doc);
-  if (result) { return result; }
-
-  // If we didn't make it to the end, it's an error
-  if ( next_structural_index != n_structural_indexes ) {
-    logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
-    return error = TAPE_ERROR;
-  }
-
-  return SUCCESS;
-}
-
-/************
- * The JSON is parsed to a tape, see the accompanying tape.md file
- * for documentation.
- ***********/
-WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
-  return stage2::parse_structurals<true>(*this, _doc);
-}
-/* end file src/generic/stage2/tape_writer.h */
-
-WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
-  error_code err = stage1(_buf, _len, false);
-  if (err) { return err; }
-  return stage2(_doc);
-}
+/* end file src/generic/stage2/streaming_structural_parser.h */
 
 } // namespace haswell
 } // namespace simdjson
 UNTARGET_REGION
-/* end file src/generic/stage2/tape_writer.h */
+/* end file src/generic/stage2/streaming_structural_parser.h */
 #endif
 #if SIMDJSON_IMPLEMENTATION_WESTMERE
-/* begin file src/westmere/implementation.cpp */
-/* westmere/implementation.h already included: #include "westmere/implementation.h" */
-/* begin file src/westmere/dom_parser_implementation.h */
-#ifndef SIMDJSON_WESTMERE_DOM_PARSER_IMPLEMENTATION_H
-#define SIMDJSON_WESTMERE_DOM_PARSER_IMPLEMENTATION_H
-
-/* isadetection.h already included: #include "isadetection.h" */
-
-namespace simdjson {
-namespace westmere {
-
-/* begin file src/generic/dom_parser_implementation.h */
-// expectation: sizeof(scope_descriptor) = 64/8.
-struct scope_descriptor {
-  uint32_t tape_index; // where, on the tape, does the scope ([,{) begins
-  uint32_t count; // how many elements in the scope
-}; // struct scope_descriptor
-
-#ifdef SIMDJSON_USE_COMPUTED_GOTO
-typedef void* ret_address_t;
-#else
-typedef char ret_address_t;
-#endif
-
-class dom_parser_implementation final : public internal::dom_parser_implementation {
-public:
-  /** Tape location of each open { or [ */
-  std::unique_ptr<scope_descriptor[]> containing_scope{};
-  /** Return address of each open { or [ */
-  std::unique_ptr<ret_address_t[]> ret_address{};
-  /** Buffer passed to stage 1 */
-  const uint8_t *buf{};
-  /** Length passed to stage 1 */
-  size_t len{0};
-  /** Document passed to stage 2 */
-  dom::document *doc{};
-  /** Error code (TODO remove, this is not even used, we just set it so the g++ optimizer doesn't get confused) */
-  error_code error{UNINITIALIZED};
-
-  really_inline dom_parser_implementation();
-  dom_parser_implementation(const dom_parser_implementation &) = delete;
-  dom_parser_implementation & operator=(const dom_parser_implementation &) = delete;
-
-  WARN_UNUSED error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
-  WARN_UNUSED error_code stage1(const uint8_t *buf, size_t len, bool partial) noexcept final;
-  WARN_UNUSED error_code check_for_unclosed_array() noexcept;
-  WARN_UNUSED error_code stage2(dom::document &doc) noexcept final;
-  WARN_UNUSED error_code stage2_next(dom::document &doc) noexcept final;
-  WARN_UNUSED error_code set_capacity(size_t capacity) noexcept final;
-  WARN_UNUSED error_code set_max_depth(size_t max_depth) noexcept final;
-};
-
-/* begin file src/generic/stage1/allocate.h */
-namespace stage1 {
-namespace allocate {
-
-//
-// Allocates stage 1 internal state and outputs in the parser
-//
-really_inline error_code set_capacity(internal::dom_parser_implementation &parser, size_t capacity) {
-  size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7;
-  parser.structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] );
-  if (!parser.structural_indexes) { return MEMALLOC; }
-  parser.structural_indexes[0] = 0;
-  parser.n_structural_indexes = 0;
-  return SUCCESS;
-}
-
-} // namespace allocate
-} // namespace stage1
-/* end file src/generic/stage1/allocate.h */
-/* begin file src/generic/stage2/allocate.h */
-namespace stage2 {
-namespace allocate {
-
-//
-// Allocates stage 2 internal state and outputs in the parser
-//
-really_inline error_code set_max_depth(dom_parser_implementation &parser, size_t max_depth) {
-  parser.containing_scope.reset(new (std::nothrow) scope_descriptor[max_depth]);
-  parser.ret_address.reset(new (std::nothrow) ret_address_t[max_depth]);
-
-  if (!parser.ret_address || !parser.containing_scope) {
-    return MEMALLOC;
-  }
-  return SUCCESS;
-}
-
-} // namespace allocate
-} // namespace stage2
-/* end file src/generic/stage2/allocate.h */
-
-really_inline dom_parser_implementation::dom_parser_implementation() {}
-
-// Leaving these here so they can be inlined if so desired
-WARN_UNUSED error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
-  error_code err = stage1::allocate::set_capacity(*this, capacity);
-  if (err) { _capacity = 0; return err; }
-  _capacity = capacity;
-  return SUCCESS;
-}
-
-WARN_UNUSED error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept {
-  error_code err = stage2::allocate::set_max_depth(*this, max_depth);
-  if (err) { _max_depth = 0; return err; }
-  _max_depth = max_depth;
-  return SUCCESS;
-}
-/* end file src/generic/stage2/allocate.h */
-
-} // namespace westmere
-} // namespace simdjson
-
-#endif // SIMDJSON_WESTMERE_DOM_PARSER_IMPLEMENTATION_H
-/* end file src/generic/stage2/allocate.h */
-
-TARGET_HASWELL
-
-namespace simdjson {
-namespace westmere {
-
-WARN_UNUSED error_code implementation::create_dom_parser_implementation(
-  size_t capacity,
-  size_t max_depth,
-  std::unique_ptr<internal::dom_parser_implementation>& dst
-) const noexcept {
-  dst.reset( new (std::nothrow) dom_parser_implementation() );
-  if (!dst) { return MEMALLOC; }
-  dst->set_capacity(capacity);
-  dst->set_max_depth(max_depth);
-  return SUCCESS;
-}
-
-} // namespace westmere
-} // namespace simdjson
-
-UNTARGET_REGION
-/* end file src/generic/stage2/allocate.h */
-/* begin file src/westmere/dom_parser_implementation.cpp */
-/* westmere/implementation.h already included: #include "westmere/implementation.h" */
-/* westmere/dom_parser_implementation.h already included: #include "westmere/dom_parser_implementation.h" */
-
-//
-// Stage 1
-//
+/* begin file src/westmere/stage1.cpp */
 /* begin file src/westmere/bitmask.h */
 #ifndef SIMDJSON_WESTMERE_BITMASK_H
 #define SIMDJSON_WESTMERE_BITMASK_H
@@ -11332,21 +10739,24 @@ really_inline simd8<bool> must_be_continuation(simd8<uint8_t> prev1, simd8<uint8
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-  really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
-  really_inline size_t block_index();
-  really_inline bool has_full_block() const;
-  really_inline const uint8_t *full_block() const;
-  /**
-   * Get the last block, padded with spaces.
-   *
-   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
-   * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
-   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
-   *
-   * @return the number of effective characters in the last block.
-   */
-  really_inline size_t get_remainder(uint8_t *dst) const;
-  really_inline void advance();
+  really_inline buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
+  really_inline size_t block_index() { return idx; }
+  really_inline bool has_full_block() const {
+    return idx < lenminusstep;
+  }
+  really_inline const uint8_t *full_block() const {
+    return &buf[idx];
+  }
+  really_inline bool has_remainder() const {
+    return idx < len;
+  }
+  really_inline void get_remainder(uint8_t *tmp_buf) const {
+    memset(tmp_buf, 0x20, STEP_SIZE);
+    memcpy(tmp_buf, buf + idx, len - idx);
+  }
+  really_inline void advance() {
+    idx += STEP_SIZE;
+  }
 private:
   const uint8_t *buf;
   const size_t len;
@@ -11354,18 +10764,6 @@ struct buf_block_reader {
   size_t idx;
 };
 
-constexpr const int TITLE_SIZE = 12;
-
-// Routines to print masks and text for debugging bitmask operations
-UNUSED static char * format_input_text_64(const uint8_t *text) {
-  static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
-}
-
 // Routines to print masks and text for debugging bitmask operations
 UNUSED static char * format_input_text(const simd8x64<uint8_t> in) {
   static char *buf = (char*)malloc(sizeof(simd8x64<uint8_t>) + 1);
@@ -11385,34 +10783,6 @@ UNUSED static char * format_mask(uint64_t mask) {
   buf[64] = '\0';
   return buf;
 }
-
-template<size_t STEP_SIZE>
-really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
-
-template<size_t STEP_SIZE>
-really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
-
-template<size_t STEP_SIZE>
-really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
-  return idx < lenminusstep;
-}
-
-template<size_t STEP_SIZE>
-really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
-  return &buf[idx];
-}
-
-template<size_t STEP_SIZE>
-really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
-  memset(dst, 0x20, STEP_SIZE); // memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
-  memcpy(dst, buf + idx, len - idx);
-  return len - idx;
-}
-
-template<size_t STEP_SIZE>
-really_inline void buf_block_reader<STEP_SIZE>::advance() {
-  idx += STEP_SIZE;
-}
 /* end file src/generic/stage1/buf_block_reader.h */
 /* begin file src/generic/stage1/json_string_scanner.h */
 namespace stage1 {
@@ -11712,15 +11082,13 @@ template<size_t STEP_SIZE>
 error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
   buf_block_reader<STEP_SIZE> reader(buf, len);
   json_minifier minifier(dst);
-
-  // Index the first n-1 blocks
   while (reader.has_full_block()) {
     minifier.step<STEP_SIZE>(reader.full_block(), reader);
   }
 
-  // Index the last (remainder) block, padded with spaces
-  uint8_t block[STEP_SIZE];
-  if (likely(reader.get_remainder(block)) > 0) {
+  if (likely(reader.has_remainder())) {
+    uint8_t block[STEP_SIZE];
+    reader.get_remainder(block);
     minifier.step<STEP_SIZE>(block, reader);
   }
 
@@ -11733,94 +11101,6 @@ WARN_UNUSED error_code implementation::minify(const uint8_t *buf, size_t len, ui
   return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
 }
 
-/* begin file src/generic/stage1/find_next_document_index.h */
-/**
-  * This algorithm is used to quickly identify the last structural position that
-  * makes up a complete document.
-  *
-  * It does this by going backwards and finding the last *document boundary* (a
-  * place where one value follows another without a comma between them). If the
-  * last document (the characters after the boundary) has an equal number of
-  * start and end brackets, it is considered complete.
-  *
-  * Simply put, we iterate over the structural characters, starting from
-  * the end. We consider that we found the end of a JSON document when the
-  * first element of the pair is NOT one of these characters: '{' '[' ';' ','
-  * and when the second element is NOT one of these characters: '}' '}' ';' ','.
-  *
-  * This simple comparison works most of the time, but it does not cover cases
-  * where the batch's structural indexes contain a perfect amount of documents.
-  * In such a case, we do not have access to the structural index which follows
-  * the last document, therefore, we do not have access to the second element in
-  * the pair, and that means we cannot identify the last document. To fix this
-  * issue, we keep a count of the open and closed curly/square braces we found
-  * while searching for the pair. When we find a pair AND the count of open and
-  * closed curly/square braces is the same, we know that we just passed a
-  * complete document, therefore the last json buffer location is the end of the
-  * batch.
-  */
-really_inline static uint32_t find_next_document_index(dom_parser_implementation &parser) {
-  // TODO don't count separately, just figure out depth
-  auto arr_cnt = 0;
-  auto obj_cnt = 0;
-  for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
-    auto idxb = parser.structural_indexes[i];
-    switch (parser.buf[idxb]) {
-    case ':':
-    case ',':
-      continue;
-    case '}':
-      obj_cnt--;
-      continue;
-    case ']':
-      arr_cnt--;
-      continue;
-    case '{':
-      obj_cnt++;
-      break;
-    case '[':
-      arr_cnt++;
-      break;
-    }
-    auto idxa = parser.structural_indexes[i - 1];
-    switch (parser.buf[idxa]) {
-    case '{':
-    case '[':
-    case ':':
-    case ',':
-      continue;
-    }
-    // Last document is complete, so the next document will appear after!
-    if (!arr_cnt && !obj_cnt) {
-      return parser.n_structural_indexes;
-    }
-    // Last document is incomplete; mark the document at i + 1 as the next one
-    return i;
-  }
-  return 0;
-}
-
-// Skip the last character if it is partial
-really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
-  if (unlikely(len < 3)) {
-    switch (len) {
-      case 2:
-        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
-        if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
-        return len;
-      case 1:
-        if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
-        return len;
-      case 0:
-        return len;
-    }
-  }
-  if (buf[len-1] >= 0b11000000) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
-  if (buf[len-2] >= 0b11100000) { return len-2; } // 3- and 4-byte characters with only 1 byte left
-  if (buf[len-3] >= 0b11110000) { return len-3; } // 4-byte characters with only 3 bytes left
-  return len;
-}
-/* end file src/generic/stage1/find_next_document_index.h */
 /* begin file src/generic/stage1/utf8_lookup2_algorithm.h */
 //
 // Detect Unicode errors.
@@ -11871,9 +11151,9 @@ really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
 //   support values with more than 23 bits (which a 4-byte character supports).
 //
 //   e.g. 11111000 10100000 10000000 10000000 10000000 (U+800000)
-//
+//   
 // Legal utf-8 byte sequences per  http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94:
-//
+// 
 //   Code Points        1st       2s       3s       4s
 //  U+0000..U+007F     00..7F
 //  U+0080..U+07FF     C2..DF   80..BF
@@ -11888,7 +11168,6 @@ really_inline static size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
 using namespace simd;
 
 namespace utf8_validation {
-  // For a detailed description of the lookup2 algorithm, see the file HACKING.md under "UTF-8 validation (lookup2)".
 
   //
   // Find special case UTF-8 errors where the character is technically readable (has the right length)
@@ -11933,7 +11212,7 @@ namespace utf8_validation {
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
       // [0___]____ (ASCII)
-      0, 0, 0, 0,
+      0, 0, 0, 0,                          
       0, 0, 0, 0,
       // [10__]____ (continuation)
       0, 0, 0, 0,
@@ -11964,6 +11243,214 @@ namespace utf8_validation {
     return byte_1_high & byte_1_low & byte_2_high;
   }
 
+  //
+  // Validate the length of multibyte characters (that each multibyte character has the right number
+  // of continuation characters, and that all continuation characters are part of a multibyte
+  // character).
+  //
+  // Algorithm
+  // =========
+  //
+  // This algorithm compares *expected* continuation characters with *actual* continuation bytes,
+  // and emits an error anytime there is a mismatch.
+  //
+  // For example, in the string "𝄞₿֏ab", which has a 4-, 3-, 2- and 1-byte
+  // characters, the file will look like this:
+  //
+  // | Character             | 𝄞  |    |    |    | ₿  |    |    | ֏  |    | a  | b  |
+  // |-----------------------|----|----|----|----|----|----|----|----|----|----|----|
+  // | Character Length      |  4 |    |    |    |  3 |    |    |  2 |    |  1 |  1 |
+  // | Byte                  | F0 | 9D | 84 | 9E | E2 | 82 | BF | D6 | 8F | 61 | 62 |
+  // | is_second_byte        |    |  X |    |    |    |  X |    |    |  X |    |    |
+  // | is_third_byte         |    |    |  X |    |    |    |  X |    |    |    |    |
+  // | is_fourth_byte        |    |    |    |  X |    |    |    |    |    |    |    |
+  // | expected_continuation |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
+  // | is_continuation       |    |  X |  X |  X |    |  X |  X |    |  X |    |    |
+  //
+  // The errors here are basically (Second Byte OR Third Byte OR Fourth Byte == Continuation):
+  //
+  // - **Extra Continuations:** Any continuation that is not a second, third or fourth byte is not
+  //   part of a valid 2-, 3- or 4-byte character and is thus an error. It could be that it's just
+  //   floating around extra outside of any character, or that there is an illegal 5-byte character,
+  //   or maybe it's at the beginning of the file before any characters have started; but it's an
+  //   error in all these cases.
+  // - **Missing Continuations:** Any second, third or fourth byte that *isn't* a continuation is an error, because that means
+  //   we started a new character before we were finished with the current one.
+  //
+  // Getting the Previous Bytes
+  // --------------------------
+  //
+  // Because we want to know if a byte is the *second* (or third, or fourth) byte of a multibyte
+  // character, we need to "shift the bytes" to find that out. This is what they mean:
+  //
+  // - `is_continuation`: if the current byte is a continuation.
+  // - `is_second_byte`: if 1 byte back is the start of a 2-, 3- or 4-byte character.
+  // - `is_third_byte`: if 2 bytes back is the start of a 3- or 4-byte character.
+  // - `is_fourth_byte`: if 3 bytes back is the start of a 4-byte character.
+  //
+  // We use shuffles to go n bytes back, selecting part of the current `input` and part of the
+  // `prev_input` (search for `.prev<1>`, `.prev<2>`, etc.). These are passed in by the caller
+  // function, because the 1-byte-back data is used by other checks as well.
+  //
+  // Getting the Continuation Mask
+  // -----------------------------
+  //
+  // Once we have the right bytes, we have to get the masks. To do this, we treat UTF-8 bytes as
+  // numbers, using signed `<` and `>` operations to check if they are continuations or leads.
+  // In fact, we treat the numbers as *signed*, partly because it helps us, and partly because
+  // Intel's SIMD presently only offers signed `<` and `>` operations (not unsigned ones).
+  //
+  // In UTF-8, bytes that start with the bits 110, 1110 and 11110 are 2-, 3- and 4-byte "leads,"
+  // respectively, meaning they expect to have 1, 2 and 3 "continuation bytes" after them.
+  // Continuation bytes start with 10, and ASCII (1-byte characters) starts with 0.
+  //
+  // When treated as signed numbers, they look like this:
+  //
+  // | Type         | High Bits  | Binary Range | Signed |
+  // |--------------|------------|--------------|--------|
+  // | ASCII        | `0`        | `01111111`   |   127  |
+  // |              |            | `00000000`   |     0  |
+  // | 4+-Byte Lead | `1111`     | `11111111`   |    -1  |
+  // |              |            | `11110000`   |   -16  |
+  // | 3-Byte Lead  | `1110`     | `11101111`   |   -17  |
+  // |              |            | `11100000`   |   -32  |
+  // | 2-Byte Lead  | `110`      | `11011111`   |   -33  |
+  // |              |            | `11000000`   |   -64  |
+  // | Continuation | `10`       | `10111111`   |   -65  |
+  // |              |            | `10000000`   |  -128  |
+  //
+  // This makes it pretty easy to get the continuation mask! It's just a single comparison:
+  //
+  // ```
+  // is_continuation = input < -64
+  // ```
+  //
+  // We can do something similar for the others, but it takes two comparisons instead of one: "is
+  // the start of a 2-byte character" is `< -32` and `> -65`, for example. And 2+ bytes is `< 0` and
+  // `> -65`. Surely we can do better, they're right next to each other!
+  //
+  // Getting the is_xxx Masks: Shifting the Range
+  // --------------------------------------------
+  //
+  // Notice *why* continuations were a single comparison. The actual *range* would require two
+  // comparisons--`< -64` and `> -129`--but all characters are always greater than -128, so we get
+  // that for free. In fact, if we had *unsigned* comparisons, 2+, 3+ and 4+ comparisons would be
+  // just as easy: 4+ would be `> 239`, 3+ would be `> 223`, and 2+ would be `> 191`.
+  //
+  // Instead, we add 128 to each byte, shifting the range up to make comparison easy. This wraps
+  // ASCII down into the negative, and puts 4+-Byte Lead at the top:
+  //
+  // | Type                 | High Bits  | Binary Range | Signed |
+  // |----------------------|------------|--------------|--------|
+  // | 4+-Byte Lead (+ 128) | `0111`     | `01111111`   |   127  |
+  // |                      |            | `01110000`   |   112  |
+  // |----------------------|------------|--------------|--------|
+  // | 3-Byte Lead (+ 128)  | `0110`     | `01101111`   |   111  |
+  // |                      |            | `01100000`   |    96  |
+  // |----------------------|------------|--------------|--------|
+  // | 2-Byte Lead (+ 128)  | `010`      | `01011111`   |    95  |
+  // |                      |            | `01000000`   |    64  |
+  // |----------------------|------------|--------------|--------|
+  // | Continuation (+ 128) | `00`       | `00111111`   |    63  |
+  // |                      |            | `00000000`   |     0  |
+  // |----------------------|------------|--------------|--------|
+  // | ASCII (+ 128)        | `1`        | `11111111`   |    -1  |
+  // |                      |            | `10000000`   |  -128  |
+  // |----------------------|------------|--------------|--------|
+  //
+  // *Now* we can use signed `>` on all of them:
+  //
+  // ```
+  // prev1 = input.prev<1>
+  // prev2 = input.prev<2>
+  // prev3 = input.prev<3>
+  // prev1_flipped = input.prev<1>(prev_input) ^ 0x80; // Same as `+ 128`
+  // prev2_flipped = input.prev<2>(prev_input) ^ 0x80; // Same as `+ 128`
+  // prev3_flipped = input.prev<3>(prev_input) ^ 0x80; // Same as `+ 128`
+  // is_second_byte = prev1_flipped > 63;  // 2+-byte lead
+  // is_third_byte  = prev2_flipped > 95;  // 3+-byte lead
+  // is_fourth_byte = prev3_flipped > 111; // 4+-byte lead
+  // ```
+  //
+  // NOTE: we use `^ 0x80` instead of `+ 128` in the code, which accomplishes the same thing, and even takes the same number
+  // of cycles as `+`, but on many Intel architectures can be parallelized better (you can do 3
+  // `^`'s at a time on Haswell, but only 2 `+`'s).
+  //
+  // That doesn't look like it saved us any instructions, did it? Well, because we're adding the
+  // same number to all of them, we can save one of those `+ 128` operations by assembling
+  // `prev2_flipped` out of prev 1 and prev 3 instead of assembling it from input and adding 128
+  // to it. One more instruction saved!
+  //
+  // ```
+  // prev1 = input.prev<1>
+  // prev3 = input.prev<3>
+  // prev1_flipped = prev1 ^ 0x80; // Same as `+ 128`
+  // prev3_flipped = prev3 ^ 0x80; // Same as `+ 128`
+  // prev2_flipped = prev1_flipped.concat<2>(prev3_flipped); // <shuffle: take the first 2 bytes from prev1 and the rest from prev3>
+  // ```
+  //
+  // ### Bringing It All Together: Detecting the Errors
+  //
+  // At this point, we have `is_continuation`, `is_second_byte`, `is_third_byte` and `is_fourth_byte`.
+  // All we have left to do is check if they match!
+  //
+  // ```
+  // return (is_second_byte | is_third_byte | is_fourth_byte) ^ is_continuation;
+  // ```
+  //
+  // But wait--there's more. The above statement is only 3 operations, but they *cannot be done in
+  // parallel*. You have to do 2 `|`'s and then 1 `^`. Haswell, at least, has 3 ports that can do
+  // bitwise operations, and we're only using 1!
+  //
+  // Epilogue: Addition For Booleans
+  // -------------------------------
+  //
+  // There is one big case the above code doesn't explicitly talk about--what if is_second_byte
+  // and is_third_byte are BOTH true? That means there is a 3-byte and 2-byte character right next
+  // to each other (or any combination), and the continuation could be part of either of them!
+  // Our algorithm using `^` and `|` won't detect that the continuation byte is problematic.
+  //
+  // Never fear, though. If that situation occurs, we'll already have detected that the second
+  // leading byte was an error, because it was supposed to be a part of the preceding multibyte
+  // character, but it *wasn't a continuation*.
+  //
+  // We could stop here, but it turns out that we can fix it using `+` and `-` instead of `|` and
+  // `^`, which is both interesting and possibly useful (even though we're not using it here). It
+  // exploits the fact that in SIMD, a *true* value is -1, and a *false* value is 0. So those
+  // comparisons were giving us numbers!
+  //
+  // Given that, if you do `is_second_byte + is_third_byte + is_fourth_byte`, under normal
+  // circumstances you will either get 0 (0 + 0 + 0) or -1 (-1 + 0 + 0, etc.). Thus,
+  // `(is_second_byte + is_third_byte + is_fourth_byte) - is_continuation` will yield 0 only if
+  // *both* or *neither* are 0 (0-0 or -1 - -1). You'll get 1 or -1 if they are different. Because
+  // *any* nonzero value is treated as an error (not just -1), we're just fine here :)
+  //
+  // Further, if *more than one* multibyte character overlaps,
+  // `is_second_byte + is_third_byte + is_fourth_byte` will be -2 or -3! Subtracting `is_continuation`
+  // from *that* is guaranteed to give you a nonzero value (-1, -2 or -3). So it'll always be
+  // considered an error.
+  //
+  // One reason you might want to do this is parallelism. | and ^ cannot be regrouped with each
+  // other, so (A | B | C) ^ D will always be three operations in a row: either you do A | B -> | C -> ^ D, or
+  // you do B | C -> | A -> ^ D. But addition and subtraction *can* be regrouped: (A + B + C) - D can
+  // be written as `(A + B) + (C - D)`. This means you can do A + B and C - D at the same time, and
+  // then add the results together. Same number of operations, but if the processor can run
+  // independent things in parallel (which most can), it runs faster.
+  //
+  // This doesn't help us on Intel, but might help us elsewhere: on Haswell, at least, | and ^ have
+  // a super nice advantage in that more of them can be run at the same time (they can run on 3
+  // ports, while + and - can run on 2)! This means that we can do A | B while we're still doing C,
+  // saving us the cycle we would have earned by using +. Even more, using an instruction with a
+  // wider array of ports can help *other* code run ahead, too, since these instructions can "get
+  // out of the way," running on a port other instructions can't.
+  // 
+  // Epilogue II: One More Trick
+  // ---------------------------
+  //
+  // There's one more relevant trick up our sleeve, it turns out: on Intel we can "pay
+  // for" the (prev<1> + 128) instruction, because it can be used to save an instruction in
+  // check_special_cases()--but we'll talk about that there :)
+  //
   really_inline simd8<uint8_t> check_multibyte_lengths(simd8<uint8_t> input, simd8<uint8_t> prev_input, simd8<uint8_t> prev1) {
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
@@ -12101,22 +11588,16 @@ class bit_indexer {
 
 class json_structural_indexer {
 public:
-  /**
-   * Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
-   *
-   * @param partial Setting the partial parameter to true allows the find_structural_bits to
-   *   tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
-   *   you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
-   */
   template<size_t STEP_SIZE>
-  static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept;
+  static error_code index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept;
 
 private:
-  really_inline json_structural_indexer(uint32_t *structural_indexes);
+  really_inline json_structural_indexer(uint32_t *structural_indexes)
+  : indexer{structural_indexes} {}
   template<size_t STEP_SIZE>
   really_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
   really_inline void next(simd::simd8x64<uint8_t> in, json_block block, size_t idx);
-  really_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial);
+  really_inline error_code finish(parser &parser, size_t idx, size_t len, bool streaming);
 
   json_scanner scanner{};
   utf8_checker checker{};
@@ -12125,44 +11606,42 @@ class json_structural_indexer {
   uint64_t unescaped_chars_error = 0;
 };
 
-really_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
+really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
+  uint64_t unescaped = in.lteq(0x1F);
+  checker.check_next_input(in);
+  indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
+  prev_structurals = block.structural_start();
+  unescaped_chars_error |= block.non_quote_inside_string(unescaped);
+}
 
-//
-// PERF NOTES:
-// We pipe 2 inputs through these stages:
-// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
-//    2 inputs' worth at once so that by the time step 2 is looking for them input, it's available.
-// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
-//    The output of step 1 depends entirely on this information. These functions don't quite use
-//    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
-//    at a time. The second input's scans has some dependency on the first ones finishing it, but
-//    they can make a lot of progress before they need that information.
-// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
-//    to finish: utf-8 checks and generating the output from the last iteration.
-//
-// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
-// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
-// workout.
-//
-template<size_t STEP_SIZE>
-error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, bool partial) noexcept {
-  if (unlikely(len > parser.capacity())) { return CAPACITY; }
-  if (partial) { len = trim_partial_utf8(buf, len); }
+really_inline error_code json_structural_indexer::finish(parser &parser, size_t idx, size_t len, bool streaming) {
+  // Write out the final iteration's structurals
+  indexer.write(uint32_t(idx-64), prev_structurals);
 
-  buf_block_reader<STEP_SIZE> reader(buf, len);
-  json_structural_indexer indexer(parser.structural_indexes.get());
+  error_code error = scanner.finish(streaming);
+  if (unlikely(error != SUCCESS)) { return error; }
 
-  // Read all but the last block
-  while (reader.has_full_block()) {
-    indexer.step<STEP_SIZE>(reader.full_block(), reader);
+  if (unescaped_chars_error) {
+    return UNESCAPED_CHARS;
   }
 
-  // Take care of the last block (will always be there unless file is empty)
-  uint8_t block[STEP_SIZE];
-  if (unlikely(reader.get_remainder(block) == 0)) { return EMPTY; }
-  indexer.step<STEP_SIZE>(block, reader);
-
-  return indexer.finish(parser, reader.block_index(), len, partial);
+  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
+  /* a valid JSON file cannot have zero structural indexes - we should have
+   * found something */
+  if (unlikely(parser.n_structural_indexes == 0u)) {
+    return EMPTY;
+  }
+  if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
+    return UNEXPECTED_ERROR;
+  }
+  if (len != parser.structural_indexes[parser.n_structural_indexes - 1]) {
+    /* the string might not be NULL terminated, but we add a virtual NULL
+     * ending character. */
+    parser.structural_indexes[parser.n_structural_indexes++] = uint32_t(len);
+  }
+  /* make it safe to dereference one beyond this array */
+  parser.structural_indexes[parser.n_structural_indexes] = 0;
+  return checker.errors();
 }
 
 template<>
@@ -12184,76 +11663,60 @@ really_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_b
   reader.advance();
 }
 
-really_inline void json_structural_indexer::next(simd::simd8x64<uint8_t> in, json_block block, size_t idx) {
-  uint64_t unescaped = in.lteq(0x1F);
-  checker.check_next_input(in);
-  indexer.write(uint32_t(idx-64), prev_structurals); // Output *last* iteration's structurals to the parser
-  prev_structurals = block.structural_start();
-  unescaped_chars_error |= block.non_quote_inside_string(unescaped);
-}
-
-really_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, bool partial) {
-  // Write out the final iteration's structurals
-  indexer.write(uint32_t(idx-64), prev_structurals);
-
-  error_code error = scanner.finish(partial);
-  if (unlikely(error != SUCCESS)) { return error; }
+//
+// Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
+//
+// PERF NOTES:
+// We pipe 2 inputs through these stages:
+// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
+//    2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
+// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
+//    The output of step 1 depends entirely on this information. These functions don't quite use
+//    up enough CPU: the second half of the functions is highly serial, only using 1 execution core
+//    at a time. The second input's scans have some dependency on the first one's finishing, but
+//    they can make a lot of progress before they need that information.
+// 3. Step 1 doesn't use enough capacity, so we run some extra stuff while we're waiting for that
+//    to finish: utf-8 checks and generating the output from the last iteration.
+//
+// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
+// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
+// workout.
+//
+// Setting the streaming parameter to true allows the find_structural_bits to tolerate unclosed strings.
+// The caller should still ensure that the input is valid UTF-8. If you are processing substrings,
+// you may want to call on a function like trimmed_length_safe_utf8.
+template<size_t STEP_SIZE>
+error_code json_structural_indexer::index(const uint8_t *buf, size_t len, parser &parser, bool streaming) noexcept {
+  if (unlikely(len > parser.capacity())) { return CAPACITY; }
 
-  if (unescaped_chars_error) {
-    return UNESCAPED_CHARS;
+  buf_block_reader<STEP_SIZE> reader(buf, len);
+  json_structural_indexer indexer(parser.structural_indexes.get());
+  while (reader.has_full_block()) {
+    indexer.step<STEP_SIZE>(reader.full_block(), reader);
   }
 
-  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
-  /***
-   * This is related to https://github.com/simdjson/simdjson/issues/906
-   * Basically, we want to make sure that if the parsing continues beyond the last (valid)
-   * structural character, it quickly stops.
-   * Only three structural characters can be repeated without triggering an error in JSON:  [,] and }.
-   * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
-   * continues, then it must be [,] or }.
-   * Suppose it is ] or }. We backtrack to the first character, what could it be that would
-   * not trigger an error? It could be ] or } but no, because you can't start a document that way.
-   * It can't be a comma, a colon or any simple value. So the only way we could continue is
-   * if the repeated character is [. But if so, the document must start with [. But if the document
-   * starts with [, it should end with ]. If we enforce that rule, then we would get
-   * ][[ which is invalid.
-   **/
-  parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
-  parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
-  parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
-  parser.next_structural_index = 0;
-  // a valid JSON file cannot have zero structural indexes - we should have found something
-  if (unlikely(parser.n_structural_indexes == 0u)) {
-    return EMPTY;
-  }
-  if (unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
-    return UNEXPECTED_ERROR;
+  if (likely(reader.has_remainder())) {
+    uint8_t block[STEP_SIZE];
+    reader.get_remainder(block);
+    indexer.step<STEP_SIZE>(block, reader);
   }
-  if (partial) {
-    auto new_structural_indexes = find_next_document_index(parser);
-    if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
-      return CAPACITY; // If the buffer is partial but the document is incomplete, it's too big to parse.
-    }
-    parser.n_structural_indexes = new_structural_indexes;
-  }
-  return checker.errors();
+
+  return indexer.finish(parser, reader.block_index(), len, streaming);
 }
 
 } // namespace stage1
 /* end file src/generic/stage1/json_structural_indexer.h */
-WARN_UNUSED error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, bool streaming) noexcept {
-  this->buf = _buf;
-  this->len = _len;
-  return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
+WARN_UNUSED error_code implementation::stage1(const uint8_t *buf, size_t len, parser &parser, bool streaming) const noexcept {
+  return westmere::stage1::json_structural_indexer::index<64>(buf, len, parser, streaming);
 }
 
 } // namespace westmere
+
 } // namespace simdjson
 UNTARGET_REGION
-
-//
-// Stage 2
-//
+/* end file src/generic/stage1/json_structural_indexer.h */
+/* begin file src/westmere/stage2.cpp */
+/* westmere/implementation.h already included: #include "westmere/implementation.h" */
 /* begin file src/westmere/stringparsing.h */
 #ifndef SIMDJSON_WESTMERE_STRINGPARSING_H
 #define SIMDJSON_WESTMERE_STRINGPARSING_H
@@ -12667,10 +12130,10 @@ static bool parse_float_strtod(const char *ptr, double *outDouble) {
   // If you consume a large value and you map it to "infinity", you will no
   // longer be able to serialize back a standard-compliant JSON. And there is
   // no realistic application where you might need values so large than they
-  // can't fit in binary64. The maximal value is about  1.7976931348623157 x
+  // can't fit in binary64. The maximal value is about  1.7976931348623157 ×
   // 10^308 It is an unimaginable large number. There will never be any piece of
   // engineering involving as many as 10^308 parts. It is estimated that there
-  // are about 10^80 atoms in the universe.  The estimate for the total number
+  // are about 10^80 atoms in the universe.  The estimate for the total number
   // of electrons is similar. Using a double-precision floating-point value, we
   // can represent easily the number of atoms in the universe. We could  also
   // represent the number of ways you can pick any three individual atoms at
@@ -12690,6 +12153,26 @@ really_inline bool is_integer(char c) {
   // this gets compiled to (uint8_t)(c - '0') <= 9 on all decent compilers
 }
 
+// We need to check that the character following a zero is valid. This is
+// probably frequent and it is harder than it looks. We are building all of this
+// just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)...
+const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
+    1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+really_inline bool
+is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
+  return structural_or_whitespace_or_exponent_or_decimal_negated[c];
+}
 
 // check quickly whether the next 8 chars are made of digits
 // at a glance, it looks better than Mula's
@@ -12767,14 +12250,14 @@ never_inline bool parse_large_integer(const uint8_t *const src,
       // as a positive signed integer, but the negative version is
       // possible.
       constexpr int64_t signed_answer = INT64_MIN;
-      writer.append_s64(signed_answer);
+      writer.write_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
     } else {
       // we can negate safely
       int64_t signed_answer = -static_cast<int64_t>(i);
-      writer.append_s64(signed_answer);
+      writer.write_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, src);
 #endif
@@ -12787,12 +12270,12 @@ never_inline bool parse_large_integer(const uint8_t *const src,
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(i, src);
 #endif
-      writer.append_s64(i);
+      writer.write_s64(i);
     } else {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_unsigned_integer(i, src);
 #endif
-      writer.append_u64(i);
+      writer.write_u64(i);
     }
   }
   return is_structural_or_whitespace(*p);
@@ -12802,7 +12285,7 @@ template<typename W>
 bool slow_float_parsing(UNUSED const char * src, W writer) {
   double d;
   if (parse_float_strtod(src, &d)) {
-    writer.append_double(d);
+    writer.write_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_float(d, (const uint8_t *)src);
 #endif
@@ -12826,10 +12309,10 @@ bool slow_float_parsing(UNUSED const char * src, W writer) {
 template<typename W>
 really_inline bool parse_number(UNUSED const uint8_t *const src,
                                 UNUSED bool found_minus,
-                                W &writer) {
+                                W writer) {
 #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
                                   // useful to skip parsing
-  writer.append_s64(0);        // always write zero
+  writer.write_s64(0);        // always write zero
   return true;                    // always succeeds
 #else
   const char *p = reinterpret_cast<const char *>(src);
@@ -12849,7 +12332,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
   uint64_t i;      // an unsigned int avoids signed overflows (which are bad)
   if (*p == '0') { // 0 cannot be followed by an integer
     ++p;
-    if (is_integer(*p)) {
+    if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_invalid_number(src);
 #endif
@@ -12973,7 +12456,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       }
       // we over-decrement by one when there is a '.'
       digit_count -= int(start - start_digits);
-      if (unlikely(digit_count >= 19)) {
+      if (digit_count >= 19) {
         // Ok, chances are good that we had an overflow!
         // this is almost never going to get called!!!
         // we start anew, going slowly!!!
@@ -12981,22 +12464,14 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
         // 10000000000000000000000000000000000000000000e+308
         // 3.1415926535897932384626433832795028841971693993751
         //
-        bool success = slow_float_parsing((const char *) src, writer);
-        // The number was already written, but we made a copy of the writer
-        // when we passed it to the parse_large_integer() function, so 
-        writer.skip_double();
-        return success;
+        return slow_float_parsing((const char *) src, writer);
       }
     }
     if (unlikely(exponent < FASTFLOAT_SMALLEST_POWER) ||
         (exponent > FASTFLOAT_LARGEST_POWER)) { // this is uncommon!!!
       // this is almost never going to get called!!!
       // we start anew, going slowly!!!
-      bool success = slow_float_parsing((const char *) src, writer);
-      // The number was already written, but we made a copy of the writer when we passed it to the
-      // slow_float_parsing() function, so we have to skip those tape spots now that we've returned
-      writer.skip_double();
-      return success;
+      return slow_float_parsing((const char *) src, writer);
     }
     bool success = true;
     double d = compute_float_64(exponent, i, negative, &success);
@@ -13005,7 +12480,7 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
       success = parse_float_strtod((const char *)src, &d);
     }
     if (success) {
-      writer.append_double(d);
+      writer.write_double(d);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_float(d, src);
 #endif
@@ -13020,14 +12495,10 @@ really_inline bool parse_number(UNUSED const uint8_t *const src,
     if (unlikely(digit_count >= 18)) { // this is uncommon!!!
       // there is a good chance that we had an overflow, so we need
       // need to recover: we parse the whole thing again.
-      bool success = parse_large_integer(src, writer, found_minus);
-      // The number was already written, but we made a copy of the writer
-      // when we passed it to the parse_large_integer() function, so 
-      writer.skip_large_integer();
-      return success;
+      return parse_large_integer(src, writer, found_minus);
     }
     i = negative ? 0 - i : i;
-    writer.append_s64(i);
+    writer.write_s64(i);
 #ifdef JSON_TEST_NUMBERS // for unit testing
     found_integer(i, src);
 #endif
@@ -13052,72 +12523,6 @@ TARGET_WESTMERE
 namespace simdjson {
 namespace westmere {
 
-/* begin file src/generic/stage2/logger.h */
-// This is for an internal-only stage 2 specific logger.
-// Set LOG_ENABLED = true to log what stage 2 is doing!
-namespace logger {
-  static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
-
-  static constexpr const bool LOG_ENABLED = false;
-  static constexpr const int LOG_EVENT_LEN = 30;
-  static constexpr const int LOG_BUFFER_LEN = 20;
-  static constexpr const int LOG_DETAIL_LEN = 50;
-  static constexpr const int LOG_INDEX_LEN = 10;
-
-  static int log_depth; // Not threadsafe. Log only.
-
-  // Helper to turn unprintable or newline characters into spaces
-  static really_inline char printable_char(char c) {
-    if (c >= 0x20) {
-      return c;
-    } else {
-      return ' ';
-    }
-  }
-
-  // Print the header and set up log_start
-  static really_inline void log_start() {
-    if (LOG_ENABLED) {
-      log_depth = 0;
-      printf("\n");
-      printf("| %-*s | %-*s | %*s | %*s | %*s | %-*s | %-*s | %-*s |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", 4, "Curr", 4, "Next", 5, "Next#", 5, "Tape#", LOG_DETAIL_LEN, "Detail", LOG_INDEX_LEN, "index");
-      printf("|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|%.*s|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, 4+2, DASHES, 4+2, DASHES, 5+2, DASHES, 5+2, DASHES, LOG_DETAIL_LEN+2, DASHES, LOG_INDEX_LEN+2, DASHES);
-    }
-  }
-
-  static really_inline void log_string(const char *message) {
-    if (LOG_ENABLED) {
-      printf("%s\n", message);
-    }
-  }
-
-  // Logs a single line of 
-  template<typename S>
-  static really_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
-    if (LOG_ENABLED) {
-      printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
-      {
-        // Print the next N characters in the buffer.
-        printf("| ");
-        // Otherwise, print the characters starting from the buffer position.
-        // Print spaces for unprintable or newline characters.
-        for (int i=0;i<LOG_BUFFER_LEN;i++) {
-          printf("%c", printable_char(structurals.current()[i]));
-        }
-        printf(" ");
-      }
-      printf("|    %c ", printable_char(structurals.current_char()));
-      printf("|    %c ", printable_char(structurals.peek_next_char()));
-      printf("| %5u ", structurals.parser.structural_indexes[*(structurals.current_structural+1)]);
-      printf("| %5u ", structurals.next_tape_index());
-      printf("| %-*s ", LOG_DETAIL_LEN, detail);
-      printf("| %*u ", LOG_INDEX_LEN, *structurals.current_structural);
-      printf("|\n");
-    }
-  }
-} // namespace logger
-
-/* end file src/generic/stage2/logger.h */
 /* begin file src/generic/stage2/atomparsing.h */
 namespace stage2 {
 namespace atomparsing {
@@ -13176,34 +12581,26 @@ namespace stage2 {
 
 class structural_iterator {
 public:
-  const uint8_t* const buf;
-  uint32_t *current_structural;
-  dom_parser_implementation &parser;
-
-  // Start a structural 
-  really_inline structural_iterator(dom_parser_implementation &_parser, size_t start_structural_index)
-    : buf{_parser.buf},
-      current_structural{&_parser.structural_indexes[start_structural_index]},
-      parser{_parser} {
-  }
-  // Get the buffer position of the current structural character
-  really_inline const uint8_t* current() {
-    return &buf[*current_structural];
+  really_inline structural_iterator(const uint8_t* _buf, size_t _len, const uint32_t *_structural_indexes, size_t next_structural_index)
+    : buf{_buf},
+     len{_len},
+     structural_indexes{_structural_indexes},
+     next_structural{next_structural_index}
+    {}
+  really_inline char advance_char() {
+    idx = structural_indexes[next_structural];
+    next_structural++;
+    c = *current();
+    return c;
   }
-  // Get the current structural character
   really_inline char current_char() {
-    return buf[*current_structural];
+    return c;
   }
-  // Get the next structural character without advancing
-  really_inline char peek_next_char() {
-    return buf[*(current_structural+1)];
-  }
-  really_inline char advance_char() {
-    current_structural++;
-    return buf[*current_structural];
+  really_inline const uint8_t* current() {
+    return &buf[idx];
   }
   really_inline size_t remaining_len() {
-    return parser.len - *current_structural;
+    return len - idx;
   }
   template<typename F>
   really_inline bool with_space_terminated_copy(const F& f) {
@@ -13220,25 +12617,32 @@ class structural_iterator {
     * practice unless you are in the strange scenario where you have many JSON
     * documents made of single atoms.
     */
-    char *copy = static_cast<char *>(malloc(parser.len + SIMDJSON_PADDING));
+    char *copy = static_cast<char *>(malloc(len + SIMDJSON_PADDING));
     if (copy == nullptr) {
       return true;
     }
-    memcpy(copy, buf, parser.len);
-    memset(copy + parser.len, ' ', SIMDJSON_PADDING);
-    bool result = f(reinterpret_cast<const uint8_t*>(copy), *current_structural);
+    memcpy(copy, buf, len);
+    memset(copy + len, ' ', SIMDJSON_PADDING);
+    bool result = f(reinterpret_cast<const uint8_t*>(copy), idx);
     free(copy);
     return result;
   }
   really_inline bool past_end(uint32_t n_structural_indexes) {
-    return current_structural >= &parser.structural_indexes[n_structural_indexes];
+    return next_structural+1 > n_structural_indexes;
   }
   really_inline bool at_end(uint32_t n_structural_indexes) {
-    return current_structural == &parser.structural_indexes[n_structural_indexes];
+    return next_structural+1 == n_structural_indexes;
   }
-  really_inline bool at_beginning() {
-    return current_structural == parser.structural_indexes.get();
+  really_inline size_t next_structural_index() {
+    return next_structural;
   }
+
+  const uint8_t* const buf;
+  const size_t len;
+  const uint32_t* const structural_indexes;
+  size_t next_structural; // next structural index
+  size_t idx{0}; // location of the structural character in the input (buf)
+  uint8_t c{0};  // used to track the (structural) character we are looking at
 };
 
 } // namespace stage2
@@ -13250,105 +12654,8 @@ class structural_iterator {
 // "simdjson/stage2.h" (this simplifies amalgation)
 
 namespace stage2 {
-namespace { // Make everything here private
-
-/* begin file src/generic/stage2/tape_writer.h */
-struct tape_writer {
-  /** The next place to write to tape */
-  uint64_t *next_tape_loc;
-  
-  /** Write a signed 64-bit value to tape. */
-  really_inline void append_s64(int64_t value) noexcept;
-
-  /** Write an unsigned 64-bit value to tape. */
-  really_inline void append_u64(uint64_t value) noexcept;
-
-  /** Write a double value to tape. */
-  really_inline void append_double(double value) noexcept;
-
-  /**
-   * Append a tape entry (an 8-bit type,and 56 bits worth of value).
-   */
-  really_inline void append(uint64_t val, internal::tape_type t) noexcept;
-
-  /**
-   * Skip the current tape entry without writing.
-   *
-   * Used to skip the start of the container, since we'll come back later to fill it in when the
-   * container ends.
-   */
-  really_inline void skip() noexcept;
-
-  /**
-   * Skip the number of tape entries necessary to write a large u64 or i64.
-   */
-  really_inline void skip_large_integer() noexcept;
-
-  /**
-   * Skip the number of tape entries necessary to write a double.
-   */
-  really_inline void skip_double() noexcept;
-
-  /**
-   * Write a value to a known location on tape.
-   *
-   * Used to go back and write out the start of a container after the container ends.
-   */
-  really_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
-
-private:
-  /**
-   * Append both the tape entry, and a supplementary value following it. Used for types that need
-   * all 64 bits, such as double and uint64_t.
-   */
-  template<typename T>
-  really_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
-}; // struct number_writer
-
-really_inline void tape_writer::append_s64(int64_t value) noexcept {
-  append2(0, value, internal::tape_type::INT64);
-}
-
-really_inline void tape_writer::append_u64(uint64_t value) noexcept {
-  append(0, internal::tape_type::UINT64);
-  *next_tape_loc = value;
-  next_tape_loc++;
-}
 
-/** Write a double value to tape. */
-really_inline void tape_writer::append_double(double value) noexcept {
-  append2(0, value, internal::tape_type::DOUBLE);
-}
-
-really_inline void tape_writer::skip() noexcept {
-  next_tape_loc++;
-}
-
-really_inline void tape_writer::skip_large_integer() noexcept {
-  next_tape_loc += 2;
-}
-
-really_inline void tape_writer::skip_double() noexcept {
-  next_tape_loc += 2;
-}
-
-really_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
-  *next_tape_loc = val | ((uint64_t(char(t))) << 56);
-  next_tape_loc++;
-}
-
-template<typename T>
-really_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
-  append(val, t);
-  static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
-  memcpy(next_tape_loc, &val2, sizeof(val2));
-  next_tape_loc++;
-}
-
-really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
-  tape_loc = val | ((uint64_t(char(t))) << 56);
-}
-/* end file src/generic/stage2/tape_writer.h */
+using internal::ret_address;
 
 #ifdef SIMDJSON_USE_COMPUTED_GOTO
 #define INIT_ADDRESSES() { &&array_begin, &&array_continue, &&error, &&finish, &&object_begin, &&object_continue }
@@ -13379,88 +12686,102 @@ really_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal
 #endif // SIMDJSON_USE_COMPUTED_GOTO
 
 struct unified_machine_addresses {
-  ret_address_t array_begin;
-  ret_address_t array_continue;
-  ret_address_t error;
-  ret_address_t finish;
-  ret_address_t object_begin;
-  ret_address_t object_continue;
+  ret_address array_begin;
+  ret_address array_continue;
+  ret_address error;
+  ret_address finish;
+  ret_address object_begin;
+  ret_address object_continue;
 };
 
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { return addresses.error; } }
 
-struct structural_parser : structural_iterator {
-  /** Lets you append to the tape */
-  tape_writer tape;
+struct number_writer {
+  parser &doc_parser;
+  
+  really_inline void write_s64(int64_t value) noexcept {
+    write_tape(0, internal::tape_type::INT64);
+    std::memcpy(&doc_parser.doc.tape[doc_parser.current_loc], &value, sizeof(value));
+    ++doc_parser.current_loc;
+  }
+  really_inline void write_u64(uint64_t value) noexcept {
+    write_tape(0, internal::tape_type::UINT64);
+    doc_parser.doc.tape[doc_parser.current_loc++] = value;
+  }
+  really_inline void write_double(double value) noexcept {
+    write_tape(0, internal::tape_type::DOUBLE);
+    static_assert(sizeof(value) == sizeof(doc_parser.doc.tape[doc_parser.current_loc]), "mismatch size");
+    memcpy(&doc_parser.doc.tape[doc_parser.current_loc++], &value, sizeof(double));
+    // memcpy stands in for the type-punning form: doc.tape[doc.current_loc++] = *((uint64_t *)&d);
+  }
+  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
+    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
+  }
+}; // struct number_writer
+
+struct structural_parser {
+  structural_iterator structurals;
+  parser &doc_parser;
   /** Next write location in the string buf for stage 2 parsing */
-  uint8_t *current_string_buf_loc;
-  /** Current depth (nested objects and arrays) */
-  uint32_t depth{0};
-
-  // For non-streaming, to pass an explicit 0 as next_structural, which enables optimizations
-  really_inline structural_parser(dom_parser_implementation &_parser, uint32_t start_structural_index)
-    : structural_iterator(_parser, start_structural_index),
-      tape{parser.doc->tape.get()},
-      current_string_buf_loc{parser.doc->string_buf.get()} {
-  }
-
-  WARN_UNUSED really_inline bool start_scope(ret_address_t continue_state) {
-    parser.containing_scope[depth].tape_index = next_tape_index();
-    parser.containing_scope[depth].count = 0;
-    tape.skip(); // We don't actually *write* the start element until the end.
-    parser.ret_address[depth] = continue_state;
+  uint8_t *current_string_buf_loc{};
+  uint32_t depth;
+
+  really_inline structural_parser(
+    const uint8_t *buf,
+    size_t len,
+    parser &_doc_parser,
+    uint32_t next_structural = 0
+  ) : structurals(buf, len, _doc_parser.structural_indexes.get(), next_structural), doc_parser{_doc_parser}, depth{0} {}
+
+  WARN_UNUSED really_inline bool start_scope(internal::tape_type type, ret_address continue_state) {
+    doc_parser.containing_scope[depth].tape_index = doc_parser.current_loc;
+    doc_parser.containing_scope[depth].count = 0;
+    write_tape(0, type); // if the document is correct, this gets rewritten later
+    doc_parser.ret_address[depth] = continue_state;
     depth++;
-    bool exceeded_max_depth = depth >= parser.max_depth();
-    if (exceeded_max_depth) { log_error("Exceeded max depth!"); }
-    return exceeded_max_depth;
+    return depth >= doc_parser.max_depth();
   }
 
-  WARN_UNUSED really_inline bool start_document(ret_address_t continue_state) {
-    log_start_value("document");
-    return start_scope(continue_state);
+  WARN_UNUSED really_inline bool start_document(ret_address continue_state) {
+    return start_scope(internal::tape_type::ROOT, continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_object(ret_address_t continue_state) {
-    log_start_value("object");
-    return start_scope(continue_state);
+  WARN_UNUSED really_inline bool start_object(ret_address continue_state) {
+    return start_scope(internal::tape_type::START_OBJECT, continue_state);
   }
 
-  WARN_UNUSED really_inline bool start_array(ret_address_t continue_state) {
-    log_start_value("array");
-    return start_scope(continue_state);
+  WARN_UNUSED really_inline bool start_array(ret_address continue_state) {
+    return start_scope(internal::tape_type::START_ARRAY, continue_state);
   }
 
   // this function is responsible for annotating the start of the scope
-  really_inline void end_scope(internal::tape_type start, internal::tape_type end) noexcept {
+  really_inline void end_scope(internal::tape_type type) noexcept {
     depth--;
-    // write our doc->tape location to the header scope
+    // write our doc.tape location to the header scope
     // The root scope gets written *at* the previous location.
-    tape.append(parser.containing_scope[depth].tape_index, end);
+    write_tape(doc_parser.containing_scope[depth].tape_index, type);
     // count can overflow if it exceeds 24 bits... so we saturate
     // the convention being that a cnt of 0xffffff or more is undetermined in value (>=  0xffffff).
-    const uint32_t start_tape_index = parser.containing_scope[depth].tape_index;
-    const uint32_t count = parser.containing_scope[depth].count;
+    const uint32_t start_tape_index = doc_parser.containing_scope[depth].tape_index;
+    const uint32_t count = doc_parser.containing_scope[depth].count;
     const uint32_t cntsat = count > 0xFFFFFF ? 0xFFFFFF : count;
-    // This is a load and an OR. It would be possible to just write once at doc->tape[d.tape_index]
-    tape_writer::write(parser.doc->tape[start_tape_index], next_tape_index() | (uint64_t(cntsat) << 32), start);
-  }
-
-  really_inline uint32_t next_tape_index() {
-    return uint32_t(tape.next_tape_loc - parser.doc->tape.get());
+    // This is a load and an OR. It would be possible to just write once at doc.tape[d.tape_index]
+    doc_parser.doc.tape[start_tape_index] |= doc_parser.current_loc | (uint64_t(cntsat) << 32);
   }
 
   really_inline void end_object() {
-    log_end_value("object");
-    end_scope(internal::tape_type::START_OBJECT, internal::tape_type::END_OBJECT);
+    end_scope(internal::tape_type::END_OBJECT);
   }
   really_inline void end_array() {
-    log_end_value("array");
-    end_scope(internal::tape_type::START_ARRAY, internal::tape_type::END_ARRAY);
+    end_scope(internal::tape_type::END_ARRAY);
   }
   really_inline void end_document() {
-    log_end_value("document");
-    end_scope(internal::tape_type::ROOT, internal::tape_type::ROOT);
+    end_scope(internal::tape_type::ROOT);
+  }
+
+  really_inline void write_tape(uint64_t val, internal::tape_type t) noexcept {
+    doc_parser.doc.tape[doc_parser.current_loc++] = val | ((uint64_t(char(t))) << 56);
   }
 
   // increment_count increments the count of keys in an object or values in an array.
@@ -13468,16 +12789,17 @@ struct structural_parser : structural_iterator {
   // must be increment in the preceding depth (depth-1) where the array or
   // the object resides.
   really_inline void increment_count() {
-    parser.containing_scope[depth - 1].count++; // we have a key value pair in the object at parser.depth - 1
+    doc_parser.containing_scope[depth - 1].count++; // one more key (object) or value (array) in the scope at depth - 1
   }
 
   really_inline uint8_t *on_start_string() noexcept {
-    // we advance the point, accounting for the fact that we have a NULL termination
-    tape.append(current_string_buf_loc - parser.doc->string_buf.get(), internal::tape_type::STRING);
+    /* we advance the point, accounting for the fact that we have a NULL
+      * termination         */
+    write_tape(current_string_buf_loc - doc_parser.doc.string_buf.get(), internal::tape_type::STRING);
     return current_string_buf_loc + sizeof(uint32_t);
   }
 
-  really_inline void on_end_string(uint8_t *dst) noexcept {
+  really_inline bool on_end_string(uint8_t *dst) noexcept {
     uint32_t str_length = uint32_t(dst - (current_string_buf_loc + sizeof(uint32_t)));
     // TODO check for overflow in case someone has a crazy string (>=4GB?)
     // But only add the overflow check when the document itself exceeds 4GB
@@ -13487,49 +12809,73 @@ struct structural_parser : structural_iterator {
     // be NULL terminated? It comes at a small cost
     *dst = 0;
     current_string_buf_loc = dst + 1;
+    return true;
   }
 
-  WARN_UNUSED really_inline bool parse_string(bool key = false) {
-    log_value(key ? "key" : "string");
+  WARN_UNUSED really_inline bool parse_string() {
     uint8_t *dst = on_start_string();
-    dst = stringparsing::parse_string(current(), dst);
+    dst = stringparsing::parse_string(structurals.current(), dst);
     if (dst == nullptr) {
-      log_error("Invalid escape in string");
       return true;
     }
-    on_end_string(dst);
-    return false;
+    return !on_end_string(dst);
   }
 
   WARN_UNUSED really_inline bool parse_number(const uint8_t *src, bool found_minus) {
-    log_value("number");
-    bool succeeded = numberparsing::parse_number(src, found_minus, tape);
-    if (!succeeded) { log_error("Invalid number"); }
-    return !succeeded;
+    number_writer writer{doc_parser};
+    return !numberparsing::parse_number(src, found_minus, writer);
   }
   WARN_UNUSED really_inline bool parse_number(bool found_minus) {
-    return parse_number(current(), found_minus);
+    return parse_number(structurals.current(), found_minus);
+  }
+
+  WARN_UNUSED really_inline bool parse_atom() {
+    switch (structurals.current_char()) {
+      case 't':
+        if (!atomparsing::is_valid_true_atom(structurals.current())) { return true; }
+        write_tape(0, internal::tape_type::TRUE_VALUE);
+        break;
+      case 'f':
+        if (!atomparsing::is_valid_false_atom(structurals.current())) { return true; }
+        write_tape(0, internal::tape_type::FALSE_VALUE);
+        break;
+      case 'n':
+        if (!atomparsing::is_valid_null_atom(structurals.current())) { return true; }
+        write_tape(0, internal::tape_type::NULL_VALUE);
+        break;
+      default:
+        return true;
+    }
+    return false;
+  }
+
+  WARN_UNUSED really_inline bool parse_single_atom() {
+    switch (structurals.current_char()) {
+      case 't':
+        if (!atomparsing::is_valid_true_atom(structurals.current(), structurals.remaining_len())) { return true; }
+        write_tape(0, internal::tape_type::TRUE_VALUE);
+        break;
+      case 'f':
+        if (!atomparsing::is_valid_false_atom(structurals.current(), structurals.remaining_len())) { return true; }
+        write_tape(0, internal::tape_type::FALSE_VALUE);
+        break;
+      case 'n':
+        if (!atomparsing::is_valid_null_atom(structurals.current(), structurals.remaining_len())) { return true; }
+        write_tape(0, internal::tape_type::NULL_VALUE);
+        break;
+      default:
+        return true;
+    }
+    return false;
   }
 
-  WARN_UNUSED really_inline ret_address_t parse_value(const unified_machine_addresses &addresses, ret_address_t continue_state) {
-    switch (advance_char()) {
+  WARN_UNUSED really_inline ret_address parse_value(const unified_machine_addresses &addresses, ret_address continue_state) {
+    switch (structurals.current_char()) {
     case '"':
       FAIL_IF( parse_string() );
       return continue_state;
-    case 't':
-      log_value("true");
-      FAIL_IF( !atomparsing::is_valid_true_atom(current()) );
-      tape.append(0, internal::tape_type::TRUE_VALUE);
-      return continue_state;
-    case 'f':
-      log_value("false");
-      FAIL_IF( !atomparsing::is_valid_false_atom(current()) );
-      tape.append(0, internal::tape_type::FALSE_VALUE);
-      return continue_state;
-    case 'n':
-      log_value("null");
-      FAIL_IF( !atomparsing::is_valid_null_atom(current()) );
-      tape.append(0, internal::tape_type::NULL_VALUE);
+    case 't': case 'f': case 'n':
+      FAIL_IF( parse_atom() );
       return continue_state;
     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
@@ -13545,27 +12891,40 @@ struct structural_parser : structural_iterator {
       FAIL_IF( start_array(continue_state) );
       return addresses.array_begin;
     default:
-      log_error("Non-value found when value was expected!");
       return addresses.error;
     }
   }
 
   WARN_UNUSED really_inline error_code finish() {
+    // the string might not be NULL terminated.
+    if ( !structurals.at_end(doc_parser.n_structural_indexes) ) {
+      return on_error(TAPE_ERROR);
+    }
     end_document();
-    parser.next_structural_index = uint32_t(current_structural + 1 - &parser.structural_indexes[0]);
-
     if (depth != 0) {
-      log_error("Unclosed objects or arrays!");
-      return parser.error = TAPE_ERROR;
+      return on_error(TAPE_ERROR);
+    }
+    if (doc_parser.containing_scope[depth].tape_index != 0) {
+      return on_error(TAPE_ERROR);
     }
 
-    return SUCCESS;
+    return on_success(SUCCESS);
+  }
+
+  really_inline error_code on_error(error_code new_error_code) noexcept {
+    doc_parser.error = new_error_code;
+    return new_error_code;
+  }
+  really_inline error_code on_success(error_code success_code) noexcept {
+    doc_parser.error = success_code;
+    doc_parser.valid = true;
+    return success_code;
   }
 
   WARN_UNUSED really_inline error_code error() {
-    /* We do not need the next line because this is done by parser.init_stage2(),
+    /* We do not need the next line because this is done by doc_parser.init_stage2(),
     * pessimistically.
-    * parser.is_valid  = false;
+    * doc_parser.is_valid  = false;
     * At this point in the code, we have all the time in the world.
     * Note that we know exactly where we are in the document so we could,
     * without any overhead on the processing code, report a specific
@@ -13573,12 +12932,12 @@ struct structural_parser : structural_iterator {
     * We could even trigger special code paths to assess what happened
     * carefully,
     * all without any added cost. */
-    if (depth >= parser.max_depth()) {
-      return parser.error = DEPTH_ERROR;
+    if (depth >= doc_parser.max_depth()) {
+      return on_error(DEPTH_ERROR);
     }
-    switch (current_char()) {
+    switch (structurals.current_char()) {
     case '"':
-      return parser.error = STRING_ERROR;
+      return on_error(STRING_ERROR);
     case '0':
     case '1':
     case '2':
@@ -13590,124 +12949,92 @@ struct structural_parser : structural_iterator {
     case '8':
     case '9':
     case '-':
-      return parser.error = NUMBER_ERROR;
+      return on_error(NUMBER_ERROR);
     case 't':
-      return parser.error = T_ATOM_ERROR;
+      return on_error(T_ATOM_ERROR);
     case 'n':
-      return parser.error = N_ATOM_ERROR;
+      return on_error(N_ATOM_ERROR);
     case 'f':
-      return parser.error = F_ATOM_ERROR;
+      return on_error(F_ATOM_ERROR);
     default:
-      return parser.error = TAPE_ERROR;
+      return on_error(TAPE_ERROR);
     }
   }
 
   really_inline void init() {
-    log_start();
-    parser.error = UNINITIALIZED;
+    current_string_buf_loc = doc_parser.doc.string_buf.get();
+    doc_parser.current_loc = 0;
+    doc_parser.valid = false;
+    doc_parser.error = UNINITIALIZED;
   }
 
-  WARN_UNUSED really_inline error_code start(ret_address_t finish_state) {
-    // If there are no structurals left, return EMPTY
-    if (at_end(parser.n_structural_indexes)) {
-      return parser.error = EMPTY;
+  WARN_UNUSED really_inline error_code start(size_t len, ret_address finish_state) {
+    init(); // resets parse state: doc_parser.valid = false, doc_parser.error = UNINITIALIZED, current_loc = 0
+    if (len > doc_parser.capacity()) {
+      return CAPACITY;
     }
-
-    init();
+    // Advance to the first character as soon as possible
+    structurals.advance_char();
     // Push the root scope (there is always at least one scope)
     if (start_document(finish_state)) {
-      return parser.error = DEPTH_ERROR;
+      return on_error(DEPTH_ERROR);
     }
     return SUCCESS;
   }
 
-  really_inline void log_value(const char *type) {
-    logger::log_line(*this, "", type, "");
-  }
-
-  static really_inline void log_start() {
-    logger::log_start();
-  }
-
-  really_inline void log_start_value(const char *type) {
-    logger::log_line(*this, "+", type, "");
-    if (logger::LOG_ENABLED) { logger::log_depth++; }
-  }
-
-  really_inline void log_end_value(const char *type) {
-    if (logger::LOG_ENABLED) { logger::log_depth--; }
-    logger::log_line(*this, "-", type, "");
-  }
-
-  really_inline void log_error(const char *error) {
-    logger::log_line(*this, "", "ERROR", error);
+  really_inline char advance_char() {
+    return structurals.advance_char();
   }
-}; // struct structural_parser
+};
 
 // Redefine FAIL_IF to use goto since it'll be used inside the function now
 #undef FAIL_IF
 #define FAIL_IF(EXPR) { if (EXPR) { goto error; } }
 
-template<bool STREAMING>
-WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_parser, dom::document &doc) noexcept {
-  dom_parser.doc = &doc;
+} // namespace stage2
+
+/************
+ * The JSON is parsed to a tape, see the accompanying tape.md file
+ * for documentation.
+ ***********/
+WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
   static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
-  stage2::structural_parser parser(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
-  error_code result = parser.start(addresses.finish);
+  stage2::structural_parser parser(buf, len, doc_parser);
+  error_code result = parser.start(len, addresses.finish);
   if (result) { return result; }
 
   //
   // Read first value
   //
-  switch (parser.current_char()) {
+  switch (parser.structurals.current_char()) {
   case '{':
     FAIL_IF( parser.start_object(addresses.finish) );
     goto object_begin;
   case '[':
     FAIL_IF( parser.start_array(addresses.finish) );
-    // Make sure the outer array is closed before continuing; otherwise, there are ways we could get
-    // into memory corruption. See https://github.com/simdjson/simdjson/issues/906
-    if (!STREAMING) {
-      if (parser.buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]] != ']') {
-        goto error;
-      }
-    }
     goto array_begin;
   case '"':
     FAIL_IF( parser.parse_string() );
     goto finish;
-  case 't':
-    parser.log_value("true");
-    FAIL_IF( !atomparsing::is_valid_true_atom(parser.current(), parser.remaining_len()) );
-    parser.tape.append(0, internal::tape_type::TRUE_VALUE);
-    goto finish;
-  case 'f':
-    parser.log_value("false");
-    FAIL_IF( !atomparsing::is_valid_false_atom(parser.current(), parser.remaining_len()) );
-    parser.tape.append(0, internal::tape_type::FALSE_VALUE);
-    goto finish;
-  case 'n':
-    parser.log_value("null");
-    FAIL_IF( !atomparsing::is_valid_null_atom(parser.current(), parser.remaining_len()) );
-    parser.tape.append(0, internal::tape_type::NULL_VALUE);
+  case 't': case 'f': case 'n':
+    FAIL_IF( parser.parse_single_atom() );
     goto finish;
   case '0': case '1': case '2': case '3': case '4':
   case '5': case '6': case '7': case '8': case '9':
     FAIL_IF(
-      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], false);
       })
     );
     goto finish;
   case '-':
     FAIL_IF(
-      parser.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
         return parser.parse_number(&copy[idx], true);
       })
     );
     goto finish;
   default:
-    parser.log_error("Document starts with a non-value character");
     goto error;
   }
 
@@ -13718,45 +13045,43 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
   switch (parser.advance_char()) {
   case '"': {
     parser.increment_count();
-    FAIL_IF( parser.parse_string(true) );
+    FAIL_IF( parser.parse_string() );
     goto object_key_state;
   }
   case '}':
     parser.end_object();
     goto scope_end;
   default:
-    parser.log_error("Object does not start with a key");
     goto error;
   }
 
 object_key_state:
-  if (parser.advance_char() != ':' ) { parser.log_error("Missing colon after key in object"); goto error; }
+  FAIL_IF( parser.advance_char() != ':' );
+  parser.advance_char();
   GOTO( parser.parse_value(addresses, addresses.object_continue) );
 
 object_continue:
   switch (parser.advance_char()) {
   case ',':
     parser.increment_count();
-    if (parser.advance_char() != '"' ) { parser.log_error("Key string missing at beginning of field in object"); goto error; }
-    FAIL_IF( parser.parse_string(true) );
+    FAIL_IF( parser.advance_char() != '"' );
+    FAIL_IF( parser.parse_string() );
     goto object_key_state;
   case '}':
     parser.end_object();
     goto scope_end;
   default:
-    parser.log_error("No comma between object fields");
     goto error;
   }
 
 scope_end:
-  CONTINUE( parser.parser.ret_address[parser.depth] );
+  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
 
 //
 // Array parser states
 //
 array_begin:
-  if (parser.peek_next_char() == ']') {
-    parser.advance_char();
+  if (parser.advance_char() == ']') {
     parser.end_array();
     goto scope_end;
   }
@@ -13771,12 +13096,12 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
   switch (parser.advance_char()) {
   case ',':
     parser.increment_count();
+    parser.advance_char();
     goto main_array_switch;
   case ']':
     parser.end_array();
     goto scope_end;
   default:
-    parser.log_error("Missing comma between array values");
     goto error;
   }
 
@@ -13787,46 +13112,176 @@ WARN_UNUSED static error_code parse_structurals(dom_parser_implementation &dom_p
   return parser.error();
 }
 
-} // namespace {}
+WARN_UNUSED error_code implementation::parse(const uint8_t *buf, size_t len, parser &doc_parser) const noexcept {
+  error_code code = stage1(buf, len, doc_parser, false);
+  if (!code) {
+    code = stage2(buf, len, doc_parser);
+  }
+  return code;
+}
+/* end file src/generic/stage2/structural_parser.h */
+/* begin file src/generic/stage2/streaming_structural_parser.h */
+namespace stage2 {
+
+struct streaming_structural_parser: structural_parser {
+  really_inline streaming_structural_parser(const uint8_t *buf, size_t len, parser &_doc_parser, uint32_t next_structural) : structural_parser(buf, len, _doc_parser, next_structural) {}
+
+  // override to add streaming
+  WARN_UNUSED really_inline error_code start(UNUSED size_t len, ret_address finish_parser) {
+    init(); // sets is_valid to false
+    // Capacity ain't no thang for streaming, so we don't check it.
+    // Advance to the first character as soon as possible
+    advance_char();
+    // Push the root scope (there is always at least one scope)
+    if (start_document(finish_parser)) {
+      return on_error(DEPTH_ERROR);
+    }
+    return SUCCESS;
+  }
+
+  // override to add streaming
+  WARN_UNUSED really_inline error_code finish() {
+    if ( structurals.past_end(doc_parser.n_structural_indexes) ) {
+      return on_error(TAPE_ERROR);
+    }
+    end_document();
+    if (depth != 0) {
+      return on_error(TAPE_ERROR);
+    }
+    if (doc_parser.containing_scope[depth].tape_index != 0) {
+      return on_error(TAPE_ERROR);
+    }
+    bool finished = structurals.at_end(doc_parser.n_structural_indexes);
+    return on_success(finished ? SUCCESS : SUCCESS_AND_HAS_MORE);
+  }
+};
+
 } // namespace stage2
 
 /************
  * The JSON is parsed to a tape, see the accompanying tape.md file
  * for documentation.
  ***********/
-WARN_UNUSED error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
-  error_code result = stage2::parse_structurals<false>(*this, _doc);
+WARN_UNUSED error_code implementation::stage2(const uint8_t *buf, size_t len, parser &doc_parser, size_t &next_json) const noexcept {
+  static constexpr stage2::unified_machine_addresses addresses = INIT_ADDRESSES();
+  stage2::streaming_structural_parser parser(buf, len, doc_parser, uint32_t(next_json));
+  error_code result = parser.start(len, addresses.finish);
   if (result) { return result; }
+  //
+  // Read first value
+  //
+  switch (parser.structurals.current_char()) {
+  case '{':
+    FAIL_IF( parser.start_object(addresses.finish) );
+    goto object_begin;
+  case '[':
+    FAIL_IF( parser.start_array(addresses.finish) );
+    goto array_begin;
+  case '"':
+    FAIL_IF( parser.parse_string() );
+    goto finish;
+  case 't': case 'f': case 'n':
+    FAIL_IF( parser.parse_single_atom() );
+    goto finish;
+  case '0': case '1': case '2': case '3': case '4':
+  case '5': case '6': case '7': case '8': case '9':
+    FAIL_IF(
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+        return parser.parse_number(&copy[idx], false);
+      })
+    );
+    goto finish;
+  case '-':
+    FAIL_IF(
+      parser.structurals.with_space_terminated_copy([&](const uint8_t *copy, size_t idx) {
+        return parser.parse_number(&copy[idx], true);
+      })
+    );
+    goto finish;
+  default:
+    goto error;
+  }
 
-  // If we didn't make it to the end, it's an error
-  if ( next_structural_index != n_structural_indexes ) {
-    logger::log_string("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
-    return error = TAPE_ERROR;
+//
+// Object parser states
+//
+object_begin:
+  switch (parser.advance_char()) {
+  case '"': {
+    FAIL_IF( parser.parse_string() );
+    goto object_key_parser;
+  }
+  case '}':
+    parser.end_object();
+    goto scope_end;
+  default:
+    goto error;
   }
 
-  return SUCCESS;
-}
+object_key_parser:
+  FAIL_IF( parser.advance_char() != ':' );
+  parser.increment_count();
+  parser.advance_char();
+  GOTO( parser.parse_value(addresses, addresses.object_continue) );
 
-/************
- * The JSON is parsed to a tape, see the accompanying tape.md file
- * for documentation.
- ***********/
-WARN_UNUSED error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
-  return stage2::parse_structurals<true>(*this, _doc);
-}
-/* end file src/generic/stage2/tape_writer.h */
+object_continue:
+  switch (parser.advance_char()) {
+  case ',':
+    FAIL_IF( parser.advance_char() != '"' );
+    FAIL_IF( parser.parse_string() );
+    goto object_key_parser;
+  case '}':
+    parser.end_object();
+    goto scope_end;
+  default:
+    goto error;
+  }
+
+scope_end:
+  CONTINUE( parser.doc_parser.ret_address[parser.depth] );
+
+//
+// Array parser states
+//
+array_begin:
+  if (parser.advance_char() == ']') {
+    parser.end_array();
+    goto scope_end;
+  }
+  parser.increment_count();
+
+main_array_switch:
+  /* we call update char on all paths in, so we can peek at parser.c on the
+   * paths that can accept a close square brace (post-, and at start) */
+  GOTO( parser.parse_value(addresses, addresses.array_continue) );
+
+array_continue:
+  switch (parser.advance_char()) {
+  case ',':
+    parser.increment_count();
+    parser.advance_char();
+    goto main_array_switch;
+  case ']':
+    parser.end_array();
+    goto scope_end;
+  default:
+    goto error;
+  }
+
+finish:
+  next_json = parser.structurals.next_structural_index();
+  return parser.finish();
 
-WARN_UNUSED error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
-  error_code err = stage1(_buf, _len, false);
-  if (err) { return err; }
-  return stage2(_doc);
+error:
+  return parser.error();
 }
+/* end file src/generic/stage2/streaming_structural_parser.h */
 
 } // namespace westmere
 } // namespace simdjson
 UNTARGET_REGION
-/* end file src/generic/stage2/tape_writer.h */
+/* end file src/generic/stage2/streaming_structural_parser.h */
 #endif
 
 SIMDJSON_POP_DISABLE_WARNINGS
-/* end file src/generic/stage2/tape_writer.h */
+/* end file src/generic/stage2/streaming_structural_parser.h */
diff --git a/inst/include/simdjson.h b/inst/include/simdjson.h
index 21efa8e..0a1d140 100644
--- a/inst/include/simdjson.h
+++ b/inst/include/simdjson.h
@@ -1,4 +1,4 @@
-/* auto-generated on Fri 12 Jun 2020 13:09:36 EDT. Do not edit! */
+/* auto-generated on Wed May 20 10:23:07 EDT 2020. Do not edit! */
 /* begin file include/simdjson.h */
 #ifndef SIMDJSON_H
 #define SIMDJSON_H
@@ -2030,6 +2030,7 @@ namespace simdjson {
  */
 enum error_code {
   SUCCESS = 0,              ///< No error
+  SUCCESS_AND_HAS_MORE,     ///< @private No error and buffer still has more data
   CAPACITY,                 ///< This parser can't support a document that big
   MEMALLOC,                 ///< Error allocating memory, most likely out of memory
   TAPE_ERROR,               ///< Something went wrong while writing to the tape (stage 2), this is a generic error
@@ -2408,187 +2409,6 @@ inline char *allocate_padded_buffer(size_t length) noexcept;
 #ifndef SIMDJSON_IMPLEMENTATION_H
 #define SIMDJSON_IMPLEMENTATION_H
 
-/* begin file include/simdjson/internal/dom_parser_implementation.h */
-#ifndef SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H
-#define SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H
-
-#include <memory>
-
-namespace simdjson {
-
-namespace dom {
-class document;
-} // namespace dom
-
-namespace internal {
-
-/**
- * An implementation of simdjson's DOM parser for a particular CPU architecture.
- *
- * This class is expected to be accessed only by pointer, and never move in memory (though the
- * pointer can move).
- */
-class dom_parser_implementation {
-public:
-
-  /**
-   * @private For internal implementation use
-   *
-   * Run a full JSON parse on a single document (stage1 + stage2).
-   * 
-   * Guaranteed only to be called when capacity > document length.
-   *
-   * Overridden by each implementation.
-   *
-   * @param buf The json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
-   * @param len The length of the json document.
-   * @return The error code, or SUCCESS if there was no error.
-   */
-  WARN_UNUSED virtual error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept = 0;
-
-  /**
-   * @private For internal implementation use
-   *
-   * Stage 1 of the document parser.
-   * 
-   * Guaranteed only to be called when capacity > document length.
-   *
-   * Overridden by each implementation.
-   *
-   * @param buf The json document to parse.
-   * @param len The length of the json document.
-   * @param streaming Whether this is being called by parser::parse_many.
-   * @return The error code, or SUCCESS if there was no error.
-   */
-  WARN_UNUSED virtual error_code stage1(const uint8_t *buf, size_t len, bool streaming) noexcept = 0;
-
-  /**
-   * @private For internal implementation use
-   *
-   * Stage 2 of the document parser.
-   * 
-   * Called after stage1().
-   *
-   * Overridden by each implementation.
-   *
-   * @param doc The document to output to.
-   * @return The error code, or SUCCESS if there was no error.
-   */
-  WARN_UNUSED virtual error_code stage2(dom::document &doc) noexcept = 0;
-
-  /**
-   * @private For internal implementation use
-   *
-   * Stage 2 of the document parser for parser::parse_many.
-   *
-   * Guaranteed only to be called after stage1().
-   * Overridden by each implementation.
-   *
-   * @param doc The document to output to.
-   * @return The error code, SUCCESS if there was no error, or EMPTY if all documents have been parsed.
-   */
-  WARN_UNUSED virtual error_code stage2_next(dom::document &doc) noexcept = 0;
-
-  /**
-   * Change the capacity of this parser.
-   * 
-   * Generally used for reallocation.
-   *
-   * @param capacity The new capacity.
-   * @param max_depth The new max_depth.
-   * @return The error code, or SUCCESS if there was no error.
-   */
-  virtual error_code set_capacity(size_t capacity) noexcept = 0;
-
-  /**
-   * Change the max depth of this parser.
-   *
-   * Generally used for reallocation.
-   *
-   * @param capacity The new capacity.
-   * @param max_depth The new max_depth.
-   * @return The error code, or SUCCESS if there was no error.
-   */
-  virtual error_code set_max_depth(size_t max_depth) noexcept = 0;
-
-  /**
-   * Deallocate this parser.
-   */
-  virtual ~dom_parser_implementation() = default;
-
-  /** Number of structural indices passed from stage 1 to stage 2 */
-  uint32_t n_structural_indexes{0};
-  /** Structural indices passed from stage 1 to stage 2 */
-  std::unique_ptr<uint32_t[]> structural_indexes{};
-  /** Next structural index to parse */
-  uint32_t next_structural_index{0};
-
-  /**
-   * The largest document this parser can support without reallocating.
-   *
-   * @return Current capacity, in bytes.
-   */
-  really_inline size_t capacity() const noexcept;
-
-  /**
-   * The maximum level of nested object and arrays supported by this parser.
-   *
-   * @return Maximum depth, in bytes.
-   */
-  really_inline size_t max_depth() const noexcept;
-
-  /**
-   * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length
-   * and `max_depth` depth.
-   *
-   * @param capacity The new capacity.
-   * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH.
-   * @return The error, if there is one.
-   */
-  WARN_UNUSED inline error_code allocate(size_t capacity, size_t max_depth) noexcept;
-
-protected:
-  /**
-   * The maximum document length this parser supports.
-   *
-   * Buffers are large enough to handle any document up to this length.
-   */
-  size_t _capacity{0};
-
-  /**
-   * The maximum depth (number of nested objects and arrays) supported by this parser.
-   *
-   * Defaults to DEFAULT_MAX_DEPTH.
-   */
-  size_t _max_depth{0};
-}; // class dom_parser_implementation
-
-really_inline size_t dom_parser_implementation::capacity() const noexcept {
-  return _capacity;
-}
-
-really_inline size_t dom_parser_implementation::max_depth() const noexcept {
-  return _max_depth;
-}
-
-WARN_UNUSED
-inline error_code dom_parser_implementation::allocate(size_t capacity, size_t max_depth) noexcept {
-  if (this->max_depth() != max_depth) {
-    error_code err = set_max_depth(max_depth);
-    if (err) { return err; }
-  }
-  if (_capacity != capacity) {
-    error_code err = set_capacity(capacity);
-    if (err) { return err; }
-  }
-  return SUCCESS;
-}
-
-} // namespace internal
-} // namespace simdjson
-
-#endif // SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H
-/* end file include/simdjson/internal/dom_parser_implementation.h */
 #include <optional>
 #include <string>
 #include <atomic>
@@ -2597,8 +2417,8 @@ inline error_code dom_parser_implementation::allocate(size_t capacity, size_t ma
 namespace simdjson {
 
 namespace dom {
-  class document;
-} // namespace dom
+  class parser;
+}
 
 /**
  * An implementation of simdjson for a particular CPU architecture.
@@ -2641,19 +2461,16 @@ class implementation {
   /**
    * @private For internal implementation use
    *
-   *     const implementation *impl = simdjson::active_implementation;
-   *     cout << "simdjson is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
+   * Run a full document parse (ensure_capacity, stage1 and stage2).
    *
-   * @param capacity The largest document that will be passed to the parser.
-   * @param max_depth The maximum JSON object/array nesting this parser is expected to handle.
-   * @param dst The place to put the resulting parser implementation.
-   * @return the name of the implementation, e.g. "haswell", "westmere", "arm64"
+   * Overridden by each implementation.
+   *
+   * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
+   * @param len the length of the json document.
+   * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity.
+   * @return the error code, or SUCCESS if there was no error.
    */
-  virtual error_code create_dom_parser_implementation(
-    size_t capacity,
-    size_t max_depth,
-    std::unique_ptr<internal::dom_parser_implementation> &dst
-  ) const noexcept = 0;
+  WARN_UNUSED virtual error_code parse(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept = 0;
 
   /**
    * @private For internal implementation use
@@ -2670,6 +2487,50 @@ class implementation {
    */
   WARN_UNUSED virtual error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept = 0;
 
+  /**
+   * @private For internal implementation use
+   *
+   * Stage 1 of the document parser.
+   *
+   * Overridden by each implementation.
+   *
+   * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
+   * @param len the length of the json document.
+   * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity.
+   * @param streaming whether this is being called by parser::parse_many.
+   * @return the error code, or SUCCESS if there was no error.
+   */
+  WARN_UNUSED virtual error_code stage1(const uint8_t *buf, size_t len, dom::parser &parser, bool streaming) const noexcept = 0;
+
+  /**
+   * @private For internal implementation use
+   *
+   * Stage 2 of the document parser.
+   *
+   * Overridden by each implementation.
+   *
+   * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
+   * @param len the length of the json document.
+   * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity.
+   * @return the error code, or SUCCESS if there was no error.
+   */
+  WARN_UNUSED virtual error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser) const noexcept = 0;
+
+  /**
+   * @private For internal implementation use
+   *
+   * Stage 2 of the document parser for parser::parse_many.
+   *
+   * Overridden by each implementation.
+   *
+   * @param buf the json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
+   * @param len the length of the json document.
+   * @param parser the parser with the buffers to use. *MUST* have allocated up to at least len capacity.
+   * @param next_json the next structural index. Start this at 0 the first time, and it will be updated to the next value to pass each time.
+   * @return the error code, SUCCESS if there was no error, or SUCCESS_AND_HAS_MORE if there was no error and stage2 can be called again.
+   */
+  WARN_UNUSED virtual error_code stage2(const uint8_t *buf, size_t len, dom::parser &parser, size_t &next_json) const noexcept = 0;
+
 protected:
   /** @private Construct an implementation with the given name and description. For subclasses. */
   really_inline implementation(
@@ -2787,7 +2648,7 @@ extern SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> activ
 } // namespace simdjson
 
 #endif // SIMDJSON_IMPLEMENTATION_H
-/* end file include/simdjson/internal/dom_parser_implementation.h */
+/* end file include/simdjson/implementation.h */
 /* begin file include/simdjson/dom/array.h */
 #ifndef SIMDJSON_DOM_ARRAY_H
 #define SIMDJSON_DOM_ARRAY_H
@@ -3161,6 +3022,22 @@ class document {
 
 namespace simdjson {
 
+namespace internal {
+
+// expectation: sizeof(scope_descriptor) = 64/8.
+struct scope_descriptor {
+  uint32_t tape_index; // where, on the tape, does the scope ([,{) begins
+  uint32_t count; // how many elements in the scope
+}; // struct scope_descriptor
+
+#ifdef SIMDJSON_USE_COMPUTED_GOTO
+typedef void* ret_address;
+#else
+typedef char ret_address;
+#endif
+
+} // namespace internal
+
 namespace dom {
 
 class document_stream;
@@ -3198,14 +3075,14 @@ class parser {
    *
    * @param other The parser to take. Its capacity is zeroed.
    */
-  really_inline parser(parser &&other) noexcept;
+  parser(parser &&other) = default;
   parser(const parser &) = delete; ///< @private Disallow copying
   /**
    * Take another parser's buffers and state.
    *
    * @param other The parser to take. Its capacity is zeroed.
    */
-  really_inline parser &operator=(parser &&other) noexcept;
+  parser &operator=(parser &&other) = default;
   parser &operator=(const parser &) = delete; ///< @private Disallow copying
 
   /** Deallocate the JSON parser. */
@@ -3465,8 +3342,7 @@ class parser {
   /**
    * Set max_capacity. This is the largest document this parser can automatically support.
    *
-   * The parser may reallocate internal buffers as needed up to this amount as documents are passed
-   * to it.
+   * The parser may reallocate internal buffers as needed up to this amount.
    *
    * This call will not allocate or deallocate, even if capacity is currently above max_capacity.
    *
@@ -3479,8 +3355,19 @@ class parser {
   /** @private Use simdjson_error instead */
   using InvalidJSON [[deprecated("Use simdjson_error instead")]] = simdjson_error;
 
-  /** @private [for benchmarking access] The implementation to use */
-  std::unique_ptr<internal::dom_parser_implementation> implementation{};
+  /** @private Next location to write to in the tape */
+  uint32_t current_loc{0};
+
+  /** @private Number of structural indices passed from stage 1 to stage 2 */
+  uint32_t n_structural_indexes{0};
+  /** @private Structural indices passed from stage 1 to stage 2 */
+  std::unique_ptr<uint32_t[]> structural_indexes{};
+
+  /** @private Tape location of each open { or [ */
+  std::unique_ptr<internal::scope_descriptor[]> containing_scope{};
+
+  /** @private Return address of each open { or [ */
+  std::unique_ptr<internal::ret_address[]> ret_address{};
 
   /** @private Use `if (parser.parse(...).error())` instead */
   bool valid{false};
@@ -3520,6 +3407,20 @@ class parser {
    */
   size_t _max_capacity;
 
+  /**
+   * The maximum document length this parser supports.
+   *
+   * Buffers are large enough to handle any document up to this length.
+   */
+  size_t _capacity{0};
+
+  /**
+   * The maximum depth (number of nested objects and arrays) supported by this parser.
+   *
+   * Defaults to DEFAULT_MAX_DEPTH.
+   */
+  size_t _max_depth{0};
+
   /**
    * The loaded buffer (reused each time load() is called)
    */
@@ -3599,7 +3500,7 @@ class document_stream {
     really_inline bool operator!=(const iterator &other) const noexcept;
 
   private:
-    really_inline iterator(document_stream &s, bool finished) noexcept;
+    iterator(document_stream& stream, bool finished) noexcept;
     /** The document_stream we're iterating through. */
     document_stream& stream;
     /** Whether we're finished or not. */
@@ -3622,23 +3523,7 @@ class document_stream {
 
   document_stream(document_stream &other) = delete;    // Disallow copying
 
-  /**
-   * Construct a document_stream. Does not allocate or parse anything until the iterator is
-   * used.
-   */
-  really_inline document_stream(
-    dom::parser &parser,
-    const uint8_t *buf,
-    size_t len,
-    size_t batch_size,
-    error_code error = SUCCESS
-  ) noexcept;
-
-  /**
-   * Parse the first document in the buffer. Used by begin(), to handle allocation and
-   * initialization.
-   */
-  inline void start() noexcept;
+  really_inline document_stream(dom::parser &parser, const uint8_t *buf, size_t len, size_t batch_size, error_code error = SUCCESS) noexcept;
 
   /**
    * Parse the next document found in the buffer previously given to document_stream.
@@ -3651,7 +3536,10 @@ class document_stream {
    * pre-allocating a capacity defined by the batch_size defined when creating the
    * document_stream object.
    *
-   * The function returns simdjson::EMPTY if there is no more data to be parsed.
+   * The function returns simdjson::SUCCESS_AND_HAS_MORE (an integer = 1) in case
+   * of success and indicates that the buffer still contains more data to be parsed,
+   * meaning this function can be called again to return the next JSON document
+   * after this one.
    *
    * The function returns simdjson::SUCCESS (as integer = 0) in case of success
    * and indicates that the buffer has successfully been parsed to the end.
@@ -3662,51 +3550,55 @@ class document_stream {
    * the simdjson::error_message function converts these error codes into a string).
    *
    * You can also check validity by calling parser.is_valid(). The same parser can
-   * and should be reused for the other documents in the buffer.
-   */
-  inline void next() noexcept;
+   * and should be reused for the other documents in the buffer. */
+  inline error_code json_parse() noexcept;
 
   /**
-   * Pass the next batch through stage 1 and return when finished.
-   * When threads are enabled, this may wait for the stage 1 thread to finish.
+   * Returns the location (index) of where the next document should be in the
+   * buffer.
+   * Can be used for debugging, it tells the user the position of the end of the
+   * last
+   * valid JSON document parsed
    */
-  inline void load_batch() noexcept;
+  inline size_t get_current_buffer_loc() const { return current_buffer_loc; }
 
-  /** Get the next document index. */
-  inline size_t next_batch_start() const noexcept;
-
-  /** Pass the next batch through stage 1 with the given parser. */
-  inline error_code run_stage1(dom::parser &p, size_t batch_start) noexcept;
-
-  dom::parser &parser;
-  const uint8_t *buf;
-  const size_t len;
-  const size_t batch_size;
-  size_t batch_start{0};
-  /** The error (or lack thereof) from the current document. */
-  error_code error;
-
-#ifdef SIMDJSON_THREADS_ENABLED
-  inline void load_from_stage1_thread() noexcept;
+  /**
+   * Returns the total amount of complete documents parsed by the document_stream,
+   * in the current buffer, at the given time.
+   */
+  inline size_t get_n_parsed_docs() const { return n_parsed_docs; }
 
-  /** Start a thread to run stage 1 on the next batch. */
-  inline void start_stage1_thread() noexcept;
+  /**
+   * Returns the total amount of data (in bytes) parsed by the document_stream,
+   * in the current buffer, at the given time.
+   */
+  inline size_t get_n_bytes_parsed() const { return n_bytes_parsed; }
 
-  /** Wait for the stage 1 thread to finish and capture the results. */
-  inline void finish_stage1_thread() noexcept;
+  inline const uint8_t *buf() const { return _buf + buf_start; }
 
-  /** The error returned from the stage 1 thread. */
-  error_code stage1_thread_error{UNINITIALIZED};
-  /** The thread used to run stage 1 against the next batch in the background. */
-  std::thread stage1_thread{};
+  inline void advance(size_t offset) { buf_start += offset; }
 
-  /**
-   * The parser used to run stage 1 in the background. Will be swapped
-   * with the regular parser when finished.
-   */
-  dom::parser stage1_thread_parser{};
-#endif // SIMDJSON_THREADS_ENABLED
+  inline size_t remaining() const { return _len - buf_start; }
 
+  dom::parser &parser;
+  const uint8_t *_buf;
+  const size_t _len;
+  size_t _batch_size; // this is actually variable!
+  size_t buf_start{0};
+  size_t next_json{0};
+  bool load_next_batch{true};
+  size_t current_buffer_loc{0};
+#ifdef SIMDJSON_THREADS_ENABLED
+  size_t last_json_buffer_loc{0};
+#endif
+  size_t n_parsed_docs{0};
+  size_t n_bytes_parsed{0};
+  error_code error{SUCCESS_AND_HAS_MORE};
+#ifdef SIMDJSON_THREADS_ENABLED
+  error_code stage1_is_ok_thread{SUCCESS};
+  std::thread stage_1_thread{};
+  dom::parser parser_thread{};
+#endif
   friend class dom::parser;
 }; // class document_stream
 
@@ -4951,36 +4843,124 @@ inline std::ostream& operator<<(std::ostream& out, const simdjson_result<dom::ar
 #include <stdexcept>
 
 namespace simdjson {
-namespace dom {
+namespace internal {
+
+/**
+ * This algorithm is used to quickly identify the buffer position of
+ * the last JSON document inside the current batch.
+ *
+ * It does its work by finding the last pair of structural characters
+ * that represent the end followed by the start of a document.
+ *
+ * Simply put, we iterate over the structural characters, starting from
+ * the end. We consider that we found the end of a JSON document when the
+ * first element of the pair is NOT one of these characters: '{' '[' ':' ','
+ * and when the second element is NOT one of these characters: '}' ']' ':' ','.
+ *
+ * This simple comparison works most of the time, but it does not cover cases
+ * where the batch's structural indexes contain a perfect amount of documents.
+ * In such a case, we do not have access to the structural index which follows
+ * the last document, therefore, we do not have access to the second element in
+ * the pair, and means that we cannot identify the last document. To fix this
+ * issue, we keep a count of the open and closed curly/square braces we found
+ * while searching for the pair. When we find a pair AND the count of open and
+ * closed curly/square braces is the same, we know that we just passed a
+ * complete
+ * document, therefore the last json buffer location is the end of the batch
+ * */
+inline uint32_t find_last_json_buf_idx(const uint8_t *buf, size_t size, const dom::parser &parser) {
+  // this function can be generally useful
+  if (parser.n_structural_indexes == 0)
+    return 0;
+  auto last_i = parser.n_structural_indexes - 1;
+  if (parser.structural_indexes[last_i] == size) {
+    if (last_i == 0)
+      return 0;
+    last_i = parser.n_structural_indexes - 2;
+  }
+  auto arr_cnt = 0;
+  auto obj_cnt = 0;
+  for (auto i = last_i; i > 0; i--) {
+    auto idxb = parser.structural_indexes[i];
+    switch (buf[idxb]) {
+    case ':':
+    case ',':
+      continue;
+    case '}':
+      obj_cnt--;
+      continue;
+    case ']':
+      arr_cnt--;
+      continue;
+    case '{':
+      obj_cnt++;
+      break;
+    case '[':
+      arr_cnt++;
+      break;
+    }
+    auto idxa = parser.structural_indexes[i - 1];
+    switch (buf[idxa]) {
+    case '{':
+    case '[':
+    case ':':
+    case ',':
+      continue;
+    }
+    if (!arr_cnt && !obj_cnt) {
+      return last_i + 1;
+    }
+    return i;
+  }
+  return 0;
+}
 
+// returns true if the provided byte value is an ASCII character
+static inline bool is_ascii(char c) {
+  return ((unsigned char)c) <= 127;
+}
+
+// if the string ends with non-ASCII bytes (e.g. part of a multi-byte UTF-8
+// sequence), drop them so the last remaining byte is ASCII. May return 0.
+static inline size_t trimmed_length_safe_utf8(const char * c, size_t len) {
+  while ((len > 0) and (not is_ascii(c[len - 1]))) {
+    len--;
+  }
+  return len;
+}
+
+} // namespace internal
+
+} // namespace simdjson
+
+namespace simdjson {
+namespace dom {
 really_inline document_stream::document_stream(
   dom::parser &_parser,
-  const uint8_t *_buf,
-  size_t _len,
-  size_t _batch_size,
+  const uint8_t *buf,
+  size_t len,
+  size_t batch_size,
   error_code _error
 ) noexcept
   : parser{_parser},
-    buf{_buf},
-    len{_len},
-    batch_size{_batch_size},
-    error{_error}
+   _buf{buf},
+   _len{len},
+   _batch_size(batch_size),
+   error(_error)
 {
+  if (!error) { error = json_parse(); }
 }
 
 inline document_stream::~document_stream() noexcept {
 #ifdef SIMDJSON_THREADS_ENABLED
-  // TODO kill the thread, why should people have to wait for a non-side-effecting operation to complete
-  if (stage1_thread.joinable()) {
-    stage1_thread.join();
+  if (stage_1_thread.joinable()) {
+    stage_1_thread.join();
   }
 #endif
 }
 
 really_inline document_stream::iterator document_stream::begin() noexcept {
-  start();
-  // If there are no documents, we're finished.
-  return iterator(*this, error == EMPTY);
+  return iterator(*this, false);
 }
 
 really_inline document_stream::iterator document_stream::end() noexcept {
@@ -4992,15 +4972,17 @@ really_inline document_stream::iterator::iterator(document_stream& _stream, bool
 }
 
 really_inline simdjson_result<element> document_stream::iterator::operator*() noexcept {
-  // Once we have yielded any errors, we're finished.
-  if (stream.error) { finished = true; return stream.error; }
+  error_code err = stream.error == SUCCESS_AND_HAS_MORE ? SUCCESS : stream.error;
+  if (err) { return err; }
   return stream.parser.doc.root();
 }
 
 really_inline document_stream::iterator& document_stream::iterator::operator++() noexcept {
-  stream.next();
-  // If that was the last document, we're finished.
-  if (stream.error == EMPTY) { finished = true; }
+  if (stream.error == SUCCESS_AND_HAS_MORE) {
+    stream.error = stream.json_parse();
+  } else {
+    finished = true;
+  }
   return *this;
 }
 
@@ -5008,96 +4990,130 @@ really_inline bool document_stream::iterator::operator!=(const document_stream::
   return finished != other.finished;
 }
 
-inline void document_stream::start() noexcept {
-  if (error) { return; }
-
-  error = parser.ensure_capacity(batch_size);
-  if (error) { return; }
-
-  // Always run the first stage 1 parse immediately
-  batch_start = 0;
-  error = run_stage1(parser, batch_start);
-  if (error) { return; }
-
 #ifdef SIMDJSON_THREADS_ENABLED
-  if (next_batch_start() < len) {
-    // Kick off the first thread if needed
-    error = stage1_thread_parser.ensure_capacity(batch_size);
-    if (error) { return; }
-    start_stage1_thread();
-    if (error) { return; }
-  }
-#endif // SIMDJSON_THREADS_ENABLED
-
-  next();
-}
-
-inline void document_stream::next() noexcept {
-  if (error) { return; }
-
-  // Load the next document from the batch
-  error = parser.implementation->stage2_next(parser.doc);
-
-  // If that was the last document in the batch, load another batch (if available)
-  while (error == EMPTY) {
-    batch_start = next_batch_start();
-    if (batch_start >= len) { break; }
-
-#ifdef SIMDJSON_THREADS_ENABLED
-    load_from_stage1_thread();
-#else
-    error = run_stage1(parser, batch_start);
-#endif
-    if (error) { continue; } // If the error was EMPTY, we may want to load another batch.
-
-    // Run stage 2 on the first document in the batch
-    error = parser.implementation->stage2_next(parser.doc);
-  }
-}
 
-inline size_t document_stream::next_batch_start() const noexcept {
-  return batch_start + parser.implementation->structural_indexes[parser.implementation->n_structural_indexes];
-}
-
-inline error_code document_stream::run_stage1(dom::parser &p, size_t _batch_start) noexcept {
-  // If this is the final batch, pass partial = false
-  size_t remaining = len - _batch_start;
-  if (remaining <= batch_size) {
-    return p.implementation->stage1(&buf[_batch_start], remaining, false);
-  } else {
-    return p.implementation->stage1(&buf[_batch_start], batch_size, true);
+// threaded version of json_parse
+// todo: simplify this code further
+inline error_code document_stream::json_parse() noexcept {
+  error = parser.ensure_capacity(_batch_size);
+  if (error) { return error; }
+  error = parser_thread.ensure_capacity(_batch_size);
+  if (error) { return error; }
+
+  if (unlikely(load_next_batch)) {
+    // First time loading
+    if (!stage_1_thread.joinable()) {
+      _batch_size = (std::min)(_batch_size, remaining());
+      _batch_size = internal::trimmed_length_safe_utf8((const char *)buf(), _batch_size);
+      if (_batch_size == 0) {
+        return simdjson::UTF8_ERROR;
+      }
+      auto stage1_is_ok = error_code(simdjson::active_implementation->stage1(buf(), _batch_size, parser, true));
+      if (stage1_is_ok != simdjson::SUCCESS) {
+        return stage1_is_ok;
+      }
+      uint32_t last_index = internal::find_last_json_buf_idx(buf(), _batch_size, parser);
+      if (last_index == 0) {
+        if (parser.n_structural_indexes == 0) {
+          return simdjson::EMPTY;
+        }
+      } else {
+        parser.n_structural_indexes = last_index + 1;
+      }
+    }
+    // the second thread is running or done.
+    else {
+      stage_1_thread.join();
+      if (stage1_is_ok_thread != simdjson::SUCCESS) {
+        return stage1_is_ok_thread;
+      }
+      std::swap(parser.structural_indexes, parser_thread.structural_indexes);
+      parser.n_structural_indexes = parser_thread.n_structural_indexes;
+      advance(last_json_buffer_loc);
+      n_bytes_parsed += last_json_buffer_loc;
+    }
+    // let us decide whether we will start a new thread
+    if (remaining() - _batch_size > 0) {
+      last_json_buffer_loc =
+          parser.structural_indexes[internal::find_last_json_buf_idx(buf(), _batch_size, parser)];
+      _batch_size = (std::min)(_batch_size, remaining() - last_json_buffer_loc);
+      if (_batch_size > 0) {
+        _batch_size = internal::trimmed_length_safe_utf8(
+            (const char *)(buf() + last_json_buffer_loc), _batch_size);
+        if (_batch_size == 0) {
+          return simdjson::UTF8_ERROR;
+        }
+        // let us capture read-only variables
+        const uint8_t *const b = buf() + last_json_buffer_loc;
+        const size_t bs = _batch_size;
+        // we call the thread on a lambda that will update
+        // this->stage1_is_ok_thread
+        // there is only one thread that may write to this value
+        stage_1_thread = std::thread([this, b, bs] {
+          this->stage1_is_ok_thread = error_code(simdjson::active_implementation->stage1(b, bs, this->parser_thread, true));
+        });
+      }
+    }
+    next_json = 0;
+    load_next_batch = false;
+  } // load_next_batch
+  error_code res = simdjson::active_implementation->stage2(buf(), remaining(), parser, next_json);
+  if (res == simdjson::SUCCESS_AND_HAS_MORE) {
+    n_parsed_docs++;
+    current_buffer_loc = parser.structural_indexes[next_json];
+    load_next_batch = (current_buffer_loc == last_json_buffer_loc);
+  } else if (res == simdjson::SUCCESS) {
+    n_parsed_docs++;
+    if (remaining() > _batch_size) {
+      current_buffer_loc = parser.structural_indexes[next_json - 1];
+      load_next_batch = true;
+      res = simdjson::SUCCESS_AND_HAS_MORE;
+    }
   }
+  return res;
 }
 
-#ifdef SIMDJSON_THREADS_ENABLED
-
-inline void document_stream::load_from_stage1_thread() noexcept {
-  stage1_thread.join();
+#else  // SIMDJSON_THREADS_ENABLED
 
-  // Swap to the parser that was loaded up in the thread. Make sure the parser has
-  // enough memory to swap to, as well.
-  std::swap(parser, stage1_thread_parser);
-  error = stage1_thread_error;
-  if (error) { return; }
+// single-threaded version of json_parse
+inline error_code document_stream::json_parse() noexcept {
+  error = parser.ensure_capacity(_batch_size);
+  if (error) { return error; }
 
-  // If there's anything left, start the stage 1 thread!
-  if (next_batch_start() < len) {
-    start_stage1_thread();
+  if (unlikely(load_next_batch)) {
+    advance(current_buffer_loc);
+    n_bytes_parsed += current_buffer_loc;
+    _batch_size = (std::min)(_batch_size, remaining());
+    _batch_size = internal::trimmed_length_safe_utf8((const char *)buf(), _batch_size);
+    auto stage1_is_ok = (error_code)simdjson::active_implementation->stage1(buf(), _batch_size, parser, true);
+    if (stage1_is_ok != simdjson::SUCCESS) {
+      return stage1_is_ok;
+    }
+    uint32_t last_index = internal::find_last_json_buf_idx(buf(), _batch_size, parser);
+    if (last_index == 0) {
+      if (parser.n_structural_indexes == 0) {
+        return EMPTY;
+      }
+    } else {
+      parser.n_structural_indexes = last_index + 1;
+    }
+    load_next_batch = false;
+  } // load_next_batch
+  error_code res = simdjson::active_implementation->stage2(buf(), remaining(), parser, next_json);
+  if (likely(res == simdjson::SUCCESS_AND_HAS_MORE)) {
+    n_parsed_docs++;
+    current_buffer_loc = parser.structural_indexes[next_json];
+  } else if (res == simdjson::SUCCESS) {
+    n_parsed_docs++;
+    if (remaining() > _batch_size) {
+      current_buffer_loc = parser.structural_indexes[next_json - 1];
+      next_json = 1;
+      load_next_batch = true;
+      res = simdjson::SUCCESS_AND_HAS_MORE;
+    }
   }
+  return res;
 }
-
-inline void document_stream::start_stage1_thread() noexcept {
-  // we call the thread on a lambda that will update
-  // this->stage1_thread_error
-  // there is only one thread that may write to this value
-  // TODO this is NOT exception-safe.
-  this->stage1_thread_error = UNINITIALIZED; // In case something goes wrong, make sure it's an error
-  size_t _next_batch_start = this->next_batch_start();
-  stage1_thread = std::thread([this, _next_batch_start] {
-    this->stage1_thread_error = run_stage1(this->stage1_thread_parser, _next_batch_start);
-  });
-}
-
 #endif // SIMDJSON_THREADS_ENABLED
 
 } // namespace dom
@@ -5136,7 +5152,7 @@ inline error_code document::allocate(size_t capacity) noexcept {
   // worse with "[7,7,7,7,6,7,7,7,6,7,7,6,[7,7,7,7,6,7,7,7,6,7,7,6,7,7,7,7,7,7,6"
   //where len + 1 tape elements are
   // generated, see issue https://github.com/lemire/simdjson/issues/345
-  size_t tape_capacity = ROUNDUP_N(capacity + 3, 64);
+  size_t tape_capacity = ROUNDUP_N(capacity + 2, 64);
   // a document with only zero-length strings... could have len/3 string
   // and we would need len/3 * 5 bytes on the string buffer
   size_t string_capacity = ROUNDUP_N(5 * capacity / 3 + 32, 64);
@@ -6725,11 +6741,8 @@ namespace dom {
 //
 really_inline parser::parser(size_t max_capacity) noexcept
   : _max_capacity{max_capacity},
-    loaded_bytes(nullptr, &aligned_free_char) {
-}
-really_inline parser::parser(parser &&other) noexcept = default;
-really_inline parser &parser::operator=(parser &&other) noexcept = default;
-
+    loaded_bytes(nullptr, &aligned_free_char)
+    {}
 inline bool parser::is_valid() const noexcept { return valid; }
 inline int parser::get_error_code() const noexcept { return error; }
 inline std::string parser::get_error_message() const noexcept { return error_message(error); }
@@ -6812,12 +6825,15 @@ inline simdjson_result<element> parser::parse(const uint8_t *buf, size_t len, bo
     memcpy((void *)buf, tmp_buf, len);
   }
 
-  code = implementation->parse(buf, len, doc);
+  code = simdjson::active_implementation->parse(buf, len, *this);
   if (realloc_if_needed) {
     aligned_free((void *)buf); // must free before we exit
   }
   if (code) { return code; }
 
+  // We're indicating validity via the simdjson_result<element>, so set the parse state back to invalid
+  valid = false;
+  error = UNINITIALIZED;
   return doc.root();
 }
 really_inline simdjson_result<element> parser::parse(const char *buf, size_t len, bool realloc_if_needed) & noexcept {
@@ -6844,30 +6860,81 @@ inline document_stream parser::parse_many(const padded_string &s, size_t batch_s
 }
 
 really_inline size_t parser::capacity() const noexcept {
-  return implementation ? implementation->capacity() : 0;
+  return _capacity;
 }
 really_inline size_t parser::max_capacity() const noexcept {
   return _max_capacity;
 }
 really_inline size_t parser::max_depth() const noexcept {
-  return implementation ? implementation->max_depth() : DEFAULT_MAX_DEPTH;
+  return _max_depth;
 }
 
 WARN_UNUSED
 inline error_code parser::allocate(size_t capacity, size_t max_depth) noexcept {
   //
-  // Reallocate implementation and document if needed
+  // If capacity has changed, reallocate capacity-based buffers
   //
-  error_code err;
-  if (implementation) {
-    err = implementation->allocate(capacity, max_depth);
-  } else {
-    err = simdjson::active_implementation->create_dom_parser_implementation(capacity, max_depth, implementation);
+  if (_capacity != capacity) {
+    // Set capacity to 0 until we finish, in case there's an error
+    _capacity = 0;
+
+    //
+    // Reallocate the document
+    //
+    error_code err = doc.allocate(capacity);
+    if (err) { return err; }
+
+    //
+    // Don't allocate 0 bytes, just return.
+    //
+    if (capacity == 0) {
+      structural_indexes.reset();
+      return SUCCESS;
+    }
+
+    //
+    // Initialize stage 1 output
+    //
+    size_t max_structures = ROUNDUP_N(capacity, 64) + 2 + 7;
+    structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] ); // TODO realloc
+    if (!structural_indexes) {
+      return MEMALLOC;
+    }
+
+    _capacity = capacity;
+
+  //
+  // If capacity hasn't changed, but the document was taken, allocate a new document.
+  //
+  } else if (!doc.tape) {
+    error_code err = doc.allocate(capacity);
+    if (err) { return err; }
   }
-  if (err) { return err; }
 
-  if (implementation->capacity() != capacity || !doc.tape) {
-    return doc.allocate(capacity);
+  //
+  // If max_depth has changed, reallocate those buffers
+  //
+  if (max_depth != _max_depth) {
+    _max_depth = 0;
+
+    if (max_depth == 0) {
+      ret_address.reset();
+      containing_scope.reset();
+      return SUCCESS;
+    }
+
+    //
+    // Initialize stage 2 state
+    //
+    containing_scope.reset(new (std::nothrow) internal::scope_descriptor[max_depth]); // TODO realloc
+    ret_address.reset(new (std::nothrow) internal::ret_address[max_depth]);
+
+    if (!ret_address || !containing_scope) {
+      // Could not allocate memory
+      return MEMALLOC;
+    }
+
+    _max_depth = max_depth;
   }
   return SUCCESS;
 }
@@ -6877,24 +6944,24 @@ inline bool parser::allocate_capacity(size_t capacity, size_t max_depth) noexcep
   return !allocate(capacity, max_depth);
 }
 
+really_inline void parser::set_max_capacity(size_t max_capacity) noexcept {
+  _max_capacity = max_capacity;
+}
+
 inline error_code parser::ensure_capacity(size_t desired_capacity) noexcept {
   // If we don't have enough capacity, (try to) automatically bump it.
   // If the document was taken, reallocate that too.
   // Both in one if statement to minimize unlikely branching.
-  if (unlikely(capacity() < desired_capacity || !doc.tape)) {
+  if (unlikely(desired_capacity > capacity() || !doc.tape)) {
     if (desired_capacity > max_capacity()) {
       return error = CAPACITY;
     }
-    return allocate(desired_capacity, max_depth());
+    return allocate(desired_capacity, _max_depth > 0 ? _max_depth : DEFAULT_MAX_DEPTH);
   }
 
   return SUCCESS;
 }
 
-really_inline void parser::set_max_capacity(size_t max_capacity) noexcept {
-  _max_capacity = max_capacity;
-}
-
 } // namespace dom
 } // namespace simdjson
 

From b79304d460f9b84de6e2a14be646cc2931089b14 Mon Sep 17 00:00:00 2001
From: Brendan Knapp <brendan.g.knapp@gmail.com>
Date: Tue, 16 Jun 2020 17:48:00 -0700
Subject: [PATCH 16/16] add vectorized versions of .deserialize_json() and
 .load_json(), document, add tests, and rebuild

---
 R/RcppExports.R                     |  62 ++++++-
 inst/tinytest/test_load_json.R      |  14 ++
 inst/tinytest/test_vectorized_ops.R |  82 +++++++++
 man/dot-deserialize_json.Rd         |  71 +++++++-
 src/RcppExports.cpp                 |   8 +-
 src/deserialize.cpp                 | 263 ++++++++++++++++++++++++----
 6 files changed, 448 insertions(+), 52 deletions(-)
 create mode 100644 inst/tinytest/test_load_json.R
 create mode 100644 inst/tinytest/test_vectorized_ops.R

diff --git a/R/RcppExports.R b/R/RcppExports.R
index 75710c2..4bfbfa4 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -3,7 +3,7 @@
 
 #' Deserialize JSON into R Objects
 #'
-#' @param json \code{character(1L)}
+#' @param json \code{character()} containing one or more strings of JSON data.
 #'
 #' @param json_pointer \code{character(1L)}, default: \code{""}
 #'
@@ -11,21 +11,69 @@
 #'
 #' @param empty_object default: \code{NULL}. Any R object to return for empty JSON objects.
 #'
-#' @param simplify_to default: \code{0}. Maximum simplification level.
-#'   0=dataframe, 1=matrix, 2=vector, 3=list
+#' @param simplify_to \code{integer(1L)}, default: \code{0L}.
+#'     Maximum simplification level.
+#'     0: data frame, 1: matrix, 2: vector, 3: list (no simplification)
 #'
-#' @param type_policy default: \code{0}. Level of type strictness.
-#'   0=anything goes, 1=merge integers/doubles, 2=strict
+#' @param type_policy \code{integer(1L)}, default: \code{0L}.
+#'     Level of type strictness.
+#'     0: merge everything, 1: merge numbers, 2: strict (mixed types are not merged)
 #'
-#' @param int64_r_type default: \code{0} How to return big integers to R.
-#'   0=double, 1=string, 2=bit64::integer64
+#' @param int64_r_type \code{integer(1L)}, default: \code{0L}.
+#'     How to return big integers to R.
+#'     0: \code{double}, 1: string, 2: \code{bit64::integer64}-compatible number
+#'
+#' @details
+#' Instead of using \code{lapply()} for vectors containing multiple strings/file paths,
+#'     just use \code{.deserialize_json()} and \code{.load_json()} directly as they are vectorized
+#'     (in the R sense). This is much more efficient as the underlying \code{simdjson::parser} can
+#'     reuse internal buffers between parses. Since the overwhelming majority of JSON objects
+#'     parsed will not result in R scalars, a \code{list()} is always returned when multiple items
+#'     are passed to \code{.deserialize_json()} or \code{.load_json()}. Also in keeping with
+#'     \code{lapply()}'s behavior, if the data passed has \code{names()}, the returned object will
+#'     have the same names.
 #'
 #' @keywords internal
 #'
+#' @examples
+#' # .deserialize_json() ======================================================
+#' RcppSimdJson:::.deserialize_json('[[1,2,3],[4,5,6]]')
+#'
+#' RcppSimdJson:::.deserialize_json(
+#'   '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]'
+#' )
+#'
+#' RcppSimdJson:::.deserialize_json(
+#'   c(
+#'     json1 = "[[1,2,3],[4,5,6]]",
+#'     json2 = '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]'
+#'   )
+#' )
+#'
 .deserialize_json <- function(json, json_pointer = "", empty_array = NULL, empty_object = NULL, simplify_to = 0L, type_policy = 0L, int64_r_type = 0L) {
     .Call(`_RcppSimdJson_deserialize_json`, json, json_pointer, empty_array, empty_object, simplify_to, type_policy, int64_r_type)
 }
 
+#' @rdname dot-deserialize_json
+#'
+#' @param file_path \code{character()} containing one or more paths to files containing
+#'    JSON data.
+#'
+#' @examples
+#' # .load_json() =============================================================
+#' single_file <- system.file("jsonexamples", "small", "flatadversarial.json",
+#'                            package = "RcppSimdJson")
+#' RcppSimdJson:::.load_json(single_file)
+#'
+#' multiple_files <- vapply(
+#'   c("flatadversarial.json", "adversarial.json"),
+#'   function(.x) {
+#'     system.file("jsonexamples/small", .x, package = "RcppSimdJson")
+#'   },
+#'   character(1L)
+#' )
+#' RcppSimdJson:::.load_json(multiple_files)
+#'
 .load_json <- function(file_path, json_pointer = "", empty_array = NULL, empty_object = NULL, simplify_to = 0L, type_policy = 0L, int64_r_type = 0L) {
     .Call(`_RcppSimdJson_load_json`, file_path, json_pointer, empty_array, empty_object, simplify_to, type_policy, int64_r_type)
 }
diff --git a/inst/tinytest/test_load_json.R b/inst/tinytest/test_load_json.R
new file mode 100644
index 0000000..f0262f2
--- /dev/null
+++ b/inst/tinytest/test_load_json.R
@@ -0,0 +1,14 @@
+if (RcppSimdJson:::.unsupportedArchitecture()) exit_file("Unsupported chipset")
+
+all_files <- dir("../jsonexamples", pattern = "\\.json$", 
+                 recursive = TRUE, full.names = TRUE)
+
+sapply(all_files, function(.x) expect_silent(RcppSimdJson:::.load_json(.x)))
+
+expect_error(
+  RcppSimdJson:::.load_json("../jsonexamples/amazon_cellphones.ndjson")
+)
+
+expect_error(
+  RcppSimdJson:::.load_json("not/a/real/file.json")
+)
diff --git a/inst/tinytest/test_vectorized_ops.R b/inst/tinytest/test_vectorized_ops.R
new file mode 100644
index 0000000..1199a35
--- /dev/null
+++ b/inst/tinytest/test_vectorized_ops.R
@@ -0,0 +1,82 @@
+if (RcppSimdJson:::.unsupportedArchitecture()) exit_file("Unsupported chipset")
+
+# .deserialize_json ============================================================
+test <- c(
+  first = '{"A":[[1,2,3],[4,5,6]]}',
+  second = '{"B":[{"a":1,"b":true},{"a":2,"b":false,"c":null}]}'
+)
+
+target <- list(
+  first = list(
+    A = matrix(
+      c(
+        1L, 2L, 3L,
+        4L, 5L, 6L
+      ),
+      nrow = 2L, ncol = 3L, byrow = TRUE
+    )
+  ),
+  second = list(
+    B = data.frame(
+      a = c(1L, 2L),
+      b = c(TRUE, FALSE),
+      c = c(NA, NA)
+    )
+  )
+)
+
+expect_identical(
+  RcppSimdJson:::.deserialize_json(test),
+  target
+)
+
+# confirm errors work ----------------------------------------------------------
+test <- c(
+  first = '{"A":[[1,2,3],[4,5,6]]}',
+  bad_json = '{"B":[{"a":1,"b":JUNK},{"a":2,"b":false,"c":null}]}'
+)
+expect_error(
+  RcppSimdJson:::.deserialize_json(test)
+)
+# .load_json() =================================================================
+test <- c(
+  flatadversarial.json = "../jsonexamples/small/flatadversarial.json",
+  adversarial.json = "../jsonexamples/small/adversarial.json"
+)
+
+if (!all(file.exists(test))) {
+  exit_file(
+    "flatadversarial.json and/or adversarial.json are missing."
+  )
+}
+
+target <- list(
+  flatadversarial.json = list(
+    `"Name` = c("116", "\\\"", "234", "true", "FALSE"), t = 1e+10
+  ),
+  adversarial.json = list(
+    `"Name rue` = structure(
+      c("116", "\"", "234", "true", "FALSE"),
+      .Dim = c(1L, 5L)
+    )
+  )
+)
+
+expect_identical(
+  RcppSimdJson:::.load_json(test),
+  target
+)
+
+# all files battery ------------------------------------------------------------
+all_files <- dir("../jsonexamples", pattern = "\\.json$",
+                 recursive = TRUE, full.names = TRUE)
+expect_silent(
+  RcppSimdJson:::.load_json(all_files)
+)
+# confirm errors work ----------------------------------------------------------
+expect_error(
+  RcppSimdJson:::.load_json(c("a/fake/file.json", all_files))
+)
+expect_error(
+  RcppSimdJson:::.load_json(c(all_files, "another/fake/file.json"))
+)
diff --git a/man/dot-deserialize_json.Rd b/man/dot-deserialize_json.Rd
index 72cfeed..f7ef4ac 100644
--- a/man/dot-deserialize_json.Rd
+++ b/man/dot-deserialize_json.Rd
@@ -2,6 +2,7 @@
 % Please edit documentation in R/RcppExports.R
 \name{.deserialize_json}
 \alias{.deserialize_json}
+\alias{.load_json}
 \title{Deserialize JSON into R Objects}
 \usage{
 .deserialize_json(
@@ -13,9 +14,19 @@
   type_policy = 0L,
   int64_r_type = 0L
 )
+
+.load_json(
+  file_path,
+  json_pointer = "",
+  empty_array = NULL,
+  empty_object = NULL,
+  simplify_to = 0L,
+  type_policy = 0L,
+  int64_r_type = 0L
+)
 }
 \arguments{
-\item{json}{\code{character(1L)}}
+\item{json}{\code{character()} containing one or more strings of JSON data.}
 
 \item{json_pointer}{\code{character(1L)}, default: \code{""}}
 
@@ -23,16 +34,62 @@
 
 \item{empty_object}{default: \code{NULL}. Any R object to return for empty JSON objects.}
 
-\item{simplify_to}{default: \code{0}. Maximum simplification level.
-0=dataframe, 1=matrix, 2=vector, 3=list}
+\item{simplify_to}{\code{integer(1L)}, default: \code{0L}.
+Maximum simplification level.
+0: data frame, 1: matrix, 2: vector, 3: list (no simplification)}
 
-\item{type_policy}{default: \code{0}. Level of type strictness.
-0=anything goes, 1=merge integers/doubles, 2=strict}
+\item{type_policy}{\code{integer(1L)}, default: \code{0L}.
+Level of type strictness.
+0: merge everything, 1: merge numbers, 2: strict (mixed types are not merged)}
 
-\item{int64_r_type}{default: \code{0} How to return big integers to R.
-0=double, 1=string, 2=bit64::integer64}
+\item{int64_r_type}{\code{integer(1L)}, default: \code{0L}.
+How to return big integers to R.
+0: \code{double}, 1: string, 2: \code{bit64::integer64}-compatible number}
+
+\item{file_path}{\code{character()} containing one or more paths to files containing
+JSON data.}
 }
 \description{
 Deserialize JSON into R Objects
+}
+\details{
+Instead of using \code{lapply()} for vectors containing multiple strings/file paths,
+    just use \code{.deserialize_json()} and \code{.load_json()} directly as they are vectorized
+    (in the R sense). This is much more efficient as the underlying \code{simdjson::parser} can
+    reuse internal buffers between parses. Since the overwhelming majority of JSON objects
+    parsed will not result in R scalars, a \code{list()} is always returned when multiple items
+    are passed to \code{.deserialize_json()} or \code{.load_json()}. Also in keeping with
+    \code{lapply()}'s behavior, if the data passed has \code{names()}, the returned object will
+    have the same names.
+}
+\examples{
+# .deserialize_json() ======================================================
+RcppSimdJson:::.deserialize_json('[[1,2,3],[4,5,6]]')
+
+RcppSimdJson:::.deserialize_json(
+  '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]'
+)
+
+RcppSimdJson:::.deserialize_json(
+  c(
+    json1 = "[[1,2,3],[4,5,6]]",
+    json2 = '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]'
+  )
+)
+
+# .load_json() =============================================================
+single_file <- system.file("jsonexamples", "small", "flatadversarial.json",
+                           package = "RcppSimdJson")
+RcppSimdJson:::.load_json(single_file)
+
+multiple_files <- vapply(
+  c("flatadversarial.json", "adversarial.json"),
+  function(.x) {
+    system.file("jsonexamples/small", .x, package = "RcppSimdJson")
+  },
+  character(1L)
+)
+RcppSimdJson:::.load_json(multiple_files)
+
 }
 \keyword{internal}
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index a149941..770e97e 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -6,12 +6,12 @@
 using namespace Rcpp;
 
 // deserialize_json
-SEXP deserialize_json(const Rcpp::String& json, const std::string& json_pointer, SEXP empty_array, SEXP empty_object, const int simplify_to, const int type_policy, const int int64_r_type);
+SEXP deserialize_json(const Rcpp::CharacterVector& json, const std::string& json_pointer, SEXP empty_array, SEXP empty_object, const int simplify_to, const int type_policy, const int int64_r_type);
 RcppExport SEXP _RcppSimdJson_deserialize_json(SEXP jsonSEXP, SEXP json_pointerSEXP, SEXP empty_arraySEXP, SEXP empty_objectSEXP, SEXP simplify_toSEXP, SEXP type_policySEXP, SEXP int64_r_typeSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
-    Rcpp::traits::input_parameter< const Rcpp::String& >::type json(jsonSEXP);
+    Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type json(jsonSEXP);
     Rcpp::traits::input_parameter< const std::string& >::type json_pointer(json_pointerSEXP);
     Rcpp::traits::input_parameter< SEXP >::type empty_array(empty_arraySEXP);
     Rcpp::traits::input_parameter< SEXP >::type empty_object(empty_objectSEXP);
@@ -23,12 +23,12 @@ BEGIN_RCPP
 END_RCPP
 }
 // load_json
-SEXP load_json(const std::string& file_path, const std::string& json_pointer, SEXP empty_array, SEXP empty_object, const int simplify_to, const int type_policy, const int int64_r_type);
+SEXP load_json(const Rcpp::CharacterVector& file_path, const std::string& json_pointer, SEXP empty_array, SEXP empty_object, const int simplify_to, const int type_policy, const int int64_r_type);
 RcppExport SEXP _RcppSimdJson_load_json(SEXP file_pathSEXP, SEXP json_pointerSEXP, SEXP empty_arraySEXP, SEXP empty_objectSEXP, SEXP simplify_toSEXP, SEXP type_policySEXP, SEXP int64_r_typeSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
-    Rcpp::traits::input_parameter< const std::string& >::type file_path(file_pathSEXP);
+    Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type file_path(file_pathSEXP);
     Rcpp::traits::input_parameter< const std::string& >::type json_pointer(json_pointerSEXP);
     Rcpp::traits::input_parameter< SEXP >::type empty_array(empty_arraySEXP);
     Rcpp::traits::input_parameter< SEXP >::type empty_object(empty_objectSEXP);
diff --git a/src/deserialize.cpp b/src/deserialize.cpp
index 8d10fe0..f37f693 100644
--- a/src/deserialize.cpp
+++ b/src/deserialize.cpp
@@ -1,9 +1,74 @@
 #include <RcppSimdJson.hpp>
 
 
+SEXP deserialize_single_string(const Rcpp::CharacterVector& json,
+                               const std::string& json_pointer,
+                               SEXP empty_array,
+                               SEXP empty_object,
+                               const rcppsimdjson::deserialize::Simplify_To simplify_to,
+                               const rcppsimdjson::deserialize::Type_Policy type_policy,
+                               const rcppsimdjson::utils::Int64_R_Type int64_r_type) {
+  using namespace rcppsimdjson;
+
+  simdjson::dom::parser parser;
+  auto [parsed, error] = json_pointer.empty() //
+                             ? parser.parse(std::string(json[0]))
+                             : parser.parse(std::string(json[0])).at(json_pointer);
+  if (error) {
+    Rcpp::stop(simdjson::error_message(error));
+  }
+
+  return deserialize::deserialize( //
+      parsed,                      //
+      empty_array,                 //
+      empty_object,                //
+      simplify_to,                 //
+      type_policy,                 //
+      int64_r_type                 //
+  );                               //
+}
+
+
+SEXP deserialize_multiple_strings(const Rcpp::CharacterVector& json,
+                                  const std::string& json_pointer,
+                                  SEXP empty_array,
+                                  SEXP empty_object,
+                                  const rcppsimdjson::deserialize::Simplify_To simplify_to,
+                                  const rcppsimdjson::deserialize::Type_Policy type_policy,
+                                  const rcppsimdjson::utils::Int64_R_Type int64_r_type) {
+  using namespace rcppsimdjson;
+
+  const auto n = json.length();
+  Rcpp::List out(n);
+  simdjson::dom::parser parser;
+
+  for (R_xlen_t i = 0; i < n; ++i) {
+    auto [parsed, error] = json_pointer.empty() //
+                               ? parser.parse(std::string(json[i]))
+                               : parser.parse(std::string(json[i])).at(json_pointer);
+    if (error) {
+      Rcpp::stop("%s\n\nValue affected:\n\t- `json[[%d]]`", simdjson::error_message(error), i + 1);
+    }
+
+    out[i] = deserialize::deserialize( //
+        parsed,                        //
+        empty_array,                   //
+        empty_object,                  //
+        simplify_to,                   //
+        type_policy,                   //
+        int64_r_type                   //
+    );                                 //
+  }
+
+  out.attr("names") = json.attr("names");
+
+  return out;
+}
+
+
 //' Deserialize JSON into R Objects
 //'
-//' @param json \code{character(1L)}
+//' @param json \code{character()} containing one or more strings of JSON data.
 //'
 //' @param json_pointer \code{character(1L)}, default: \code{""}
 //'
@@ -11,72 +76,202 @@
 //'
 //' @param empty_object default: \code{NULL}. Any R object to return for empty JSON objects.
 //'
-//' @param simplify_to default: \code{0}. Maximum simplification level.
-//'   0=dataframe, 1=matrix, 2=vector, 3=list
+//' @param simplify_to \code{integer(1L)}, default: \code{0L}.
+//'     Maximum simplification level.
+//'     0: data frame, 1: matrix, 2: vector, 3: list (no simplification)
+//'
+//' @param type_policy \code{integer(1L)}, default: \code{0L}.
+//'     Level of type strictness.
+//'     0: merge everything, 1: merge numbers, 2: strict (mixed types are not merged)
 //'
-//' @param type_policy default: \code{0}. Level of type strictness.
-//'   0=anything goes, 1=merge integers/doubles, 2=strict
+//' @param int64_r_type \code{integer(1L)}, default: \code{0L}.
+//'     How to return big integers to R.
+//'     0: \code{double}, 1: string, 2: \code{bit64::integer64}-compatible number
 //'
-//' @param int64_r_type default: \code{0} How to return big integers to R.
-//'   0=double, 1=string, 2=bit64::integer64
+//' @details
+//' Instead of using \code{lapply()} for vectors containing multiple strings/file paths,
+//'     just use \code{.deserialize_json()} and \code{.load_json()} directly as they are vectorized
+//'     (in the R sense). This is much more efficient as the underlying \code{simdjson::parser} can
+//'     reuse internal buffers between parses. Since the overwhelming majority of JSON objects
+//'     parsed will not result in R scalars, a \code{list()} is always returned when multiple items
+//'     are passed to \code{.deserialize_json()} or \code{.load_json()}. Also in keeping with
+//'     \code{lapply()}'s behavior, if the data passed has \code{names()}, the returned object will
+//'     have the same names.
 //'
 //' @keywords internal
 //'
+//' @examples
+//' # .deserialize_json() ======================================================
+//' RcppSimdJson:::.deserialize_json('[[1,2,3],[4,5,6]]')
+//'
+//' RcppSimdJson:::.deserialize_json(
+//'   '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]'
+//' )
+//'
+//' RcppSimdJson:::.deserialize_json(
+//'   c(
+//'     json1 = "[[1,2,3],[4,5,6]]",
+//'     json2 = '[{"a":1,"b":true},{"a":2,"b":false,"c":null}]'
+//'   )
+//' )
+//'
 // [[Rcpp::export(.deserialize_json)]]
-SEXP deserialize_json(const Rcpp::String& json,
+SEXP deserialize_json(const Rcpp::CharacterVector& json,
                       const std::string& json_pointer = "",
                       SEXP empty_array = R_NilValue,
                       SEXP empty_object = R_NilValue,
                       const int simplify_to = 0,
                       const int type_policy = 0,
                       const int int64_r_type = 0) {
-  using namespace rcppsimdjson;
+  switch (json.length()) {  // dispatch on how many JSON strings were supplied
+    case 0:  // empty input: nothing to parse
+      return R_NilValue;
 
-  simdjson::dom::parser parser;
+    case 1:  // one string: hand off to deserialize_single_string()
+      return deserialize_single_string(                                     //
+          json,                                                             //
+          json_pointer,                                                     //
+          empty_array,                                                      //
+          empty_object,                                                     //
+          static_cast<rcppsimdjson::deserialize::Simplify_To>(simplify_to), //
+          static_cast<rcppsimdjson::deserialize::Type_Policy>(type_policy), //
+          static_cast<rcppsimdjson::utils::Int64_R_Type>(int64_r_type)      //
+      );                                                                    //
 
-  auto [parsed, error] = json_pointer.empty() //
-                             ? parser.parse(json)
-                             : parser.parse(json).at(json_pointer);
+    default:  // several strings: hand off to deserialize_multiple_strings()
+      return deserialize_multiple_strings(                                  //
+          json,                                                             //
+          json_pointer,                                                     //
+          empty_array,                                                      //
+          empty_object,                                                     //
+          static_cast<rcppsimdjson::deserialize::Simplify_To>(simplify_to), //
+          static_cast<rcppsimdjson::deserialize::Type_Policy>(type_policy), //
+          static_cast<rcppsimdjson::utils::Int64_R_Type>(int64_r_type)      //
+      );                                                                    //
+  }
 
+  return R_NilValue;  // not reached: every branch of the switch returns
+}
+
+
+SEXP load_single_file(const Rcpp::String& file_path,
+                      const std::string& json_pointer,
+                      SEXP empty_array,
+                      SEXP empty_object,
+                      const rcppsimdjson::deserialize::Simplify_To simplify_to,
+                      const rcppsimdjson::deserialize::Type_Policy type_policy,
+                      const rcppsimdjson::utils::Int64_R_Type int64_r_type) {
+  simdjson::dom::parser parser;  // parser local to this call; used for this one file
+  auto [parsed, error] = json_pointer.empty() //
+                             ? parser.load(std::string(file_path))  // no pointer: whole document
+                             : parser.load(std::string(file_path)).at(json_pointer);
   if (error) {
     Rcpp::stop(simdjson::error_message(error));
   }
 
-  return deserialize::deserialize(parsed,
-                                  empty_array,
-                                  empty_object,
-                                  static_cast<deserialize::Simplify_To>(simplify_to),
-                                  static_cast<deserialize::Type_Policy>(type_policy),
-                                  static_cast<utils::Int64_R_Type>(int64_r_type));
+  return rcppsimdjson::deserialize::deserialize( //
+      parsed,                                    //
+      empty_array,                               //
+      empty_object,                              //
+      simplify_to,                               //
+      type_policy,                               //
+      int64_r_type                               //
+  );                                             //
+}
+
+
+SEXP load_multiple_files(const Rcpp::CharacterVector& file_path,
+                         const std::string& json_pointer,
+                         SEXP empty_array,
+                         SEXP empty_object,
+                         const rcppsimdjson::deserialize::Simplify_To simplify_to,
+                         const rcppsimdjson::deserialize::Type_Policy type_policy,
+                         const rcppsimdjson::utils::Int64_R_Type int64_r_type) {
+  const auto n = file_path.length();  // number of files to load
+  auto out = Rcpp::List(n);           // one result slot per file
+  simdjson::dom::parser parser;       // a single parser is shared by every load below
+  // Load each file in turn; a non-empty `json_pointer` keeps only the value it addresses.
+  for (R_xlen_t i = 0; i < n; ++i) {
+    auto [parsed, error] = json_pointer.empty() //
+                               ? parser.load(std::string(file_path[i]))
+                               : parser.load(std::string(file_path[i])).at(json_pointer);
+    if (error) {  // report the failure together with the path of the offending file
+      Rcpp::stop("%s\n\nFile affected:\n\t- %s", //
+                 simdjson::error_message(error), //
+                 std::string(file_path[i]));     //
+    }
+    // Pass the parsed value and the caller's options to deserialize::deserialize().
+    out[i] = rcppsimdjson::deserialize::deserialize( //
+        parsed,                                      //
+        empty_array,                                 //
+        empty_object,                                //
+        simplify_to,                                 //
+        type_policy,                                 //
+        int64_r_type                                 //
+    );                                               //
+  }
+  // Copy the names attribute of `file_path` onto the result.
+  out.attr("names") = file_path.attr("names");
+  return out;
 }
 
 
+//' @rdname dot-deserialize_json
+//'
+//' @param file_path \code{character()} containing one or more paths to files containing
+//'    JSON data.
+//'
+//' @examples
+//' # .load_json() =============================================================
+//' single_file <- system.file("jsonexamples", "small", "flatadversarial.json",
+//'                            package = "RcppSimdJson")
+//' RcppSimdJson:::.load_json(single_file)
+//'
+//' multiple_files <- vapply(
+//'   c("flatadversarial.json", "adversarial.json"),
+//'   function(.x) {
+//'     system.file("jsonexamples/small", .x, package = "RcppSimdJson")
+//'   },
+//'   character(1L)
+//' )
+//' RcppSimdJson:::.load_json(multiple_files)
+//'
 // [[Rcpp::export(.load_json)]]
-SEXP load_json(const std::string& file_path,
+SEXP load_json(const Rcpp::CharacterVector& file_path,
                const std::string& json_pointer = "",
                SEXP empty_array = R_NilValue,
                SEXP empty_object = R_NilValue,
                const int simplify_to = 0,
                const int type_policy = 0,
                const int int64_r_type = 0) {
-  using namespace rcppsimdjson;
-
-  simdjson::dom::parser parser;
+  switch (file_path.length()) {  // dispatch on how many file paths were supplied
+    case 0:  // empty input: nothing to load
+      return R_NilValue;
 
-  auto [parsed, error] = json_pointer.empty() //
-                             ? parser.load(file_path)
-                             : parser.load(file_path).at(json_pointer);
+    case 1:  // one path: hand its only element to load_single_file()
+      return load_single_file(                                              //
+          file_path[0],                                                     //
+          json_pointer,                                                     //
+          empty_array,                                                      //
+          empty_object,                                                     //
+          static_cast<rcppsimdjson::deserialize::Simplify_To>(simplify_to), //
+          static_cast<rcppsimdjson::deserialize::Type_Policy>(type_policy), //
+          static_cast<rcppsimdjson::utils::Int64_R_Type>(int64_r_type)      //
+      );                                                                    //
 
-  if (error) {
-    Rcpp::stop(simdjson::error_message(error));
+    default:  // several paths: hand the whole vector to load_multiple_files()
+      return load_multiple_files(                                           //
+          file_path,                                                        //
+          json_pointer,                                                     //
+          empty_array,                                                      //
+          empty_object,                                                     //
+          static_cast<rcppsimdjson::deserialize::Simplify_To>(simplify_to), //
+          static_cast<rcppsimdjson::deserialize::Type_Policy>(type_policy), //
+          static_cast<rcppsimdjson::utils::Int64_R_Type>(int64_r_type)      //
+      );                                                                    //
   }
 
-  return deserialize::deserialize(parsed,
-                                  empty_array,
-                                  empty_object,
-                                  static_cast<deserialize::Simplify_To>(simplify_to),
-                                  static_cast<deserialize::Type_Policy>(type_policy),
-                                  static_cast<utils::Int64_R_Type>(int64_r_type));
+  return R_NilValue;  // not reached: every branch of the switch returns
 }