diff --git a/GNUmakefile b/GNUmakefile
index eaac008523a..2ac18fd3aba 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -101,7 +101,7 @@ tool_targets += tools/tracemerge
endif
# FVTest Helper Libraries
-test_prereqs := third_party/pugixml-1.5 fvtest/util fvtest/omrGtestGlue
+test_prereqs := third_party/pugixml-1.8 fvtest/util fvtest/omrGtestGlue
test_targets += $(test_prereqs)
# Utility Libraries
diff --git a/longabout.html b/longabout.html
index fd9b7fc5b02..153c323864a 100644
--- a/longabout.html
+++ b/longabout.html
@@ -66,8 +66,8 @@
Google C++ Testing Framework 1.7.0
The source is available at https://github.com/google/googletest.
-pugixml 1.5
-Copyright (c) 2006-2015 Arseny Kapoulkine
+pugixml 1.8
+Copyright (c) 2006-2016 Arseny Kapoulkine
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
@@ -88,7 +88,7 @@
pugixml 1.5
THE SOFTWARE.
-The source is available at http://pugixml.org/2014/11/27/pugixml-1.5-release.html.
+The source is available at https://github.com/zeux/pugixml/releases/tag/v1.8.
config.sub and config.guess
diff --git a/omrmakefiles/configure.mk.in b/omrmakefiles/configure.mk.in
index 0d85e1b915a..f163b145c24 100644
--- a/omrmakefiles/configure.mk.in
+++ b/omrmakefiles/configure.mk.in
@@ -216,7 +216,7 @@ GLOBAL_LIBPATH += . $(exe_output_dir) $(lib_output_dir)
# Location of fvtest framework
OMR_GTEST_DIR := $(top_srcdir)/third_party/gtest-1.8.0
OMR_GTEST_INCLUDES := $(OMR_GTEST_DIR) $(OMR_GTEST_DIR)/include $(top_srcdir)/fvtest/omrGtestGlue
-OMR_PUGIXML_DIR := $(top_srcdir)/third_party/pugixml-1.5
+OMR_PUGIXML_DIR := $(top_srcdir)/third_party/pugixml-1.8
# googletest code requires exception handling
ifeq (linux,$(OMR_HOST_OS))
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
index 5fbb1fff868..e25e1211b53 100644
--- a/third_party/CMakeLists.txt
+++ b/third_party/CMakeLists.txt
@@ -20,5 +20,5 @@
# SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0-only WITH Classpath-exception-2.0 OR GPL-2.0-only WITH OpenJDK-assembly-exception-1.0
###############################################################################
-add_subdirectory(pugixml-1.5)
+add_subdirectory(pugixml-1.8)
# TODO: gtest/gmock
diff --git a/third_party/pugixml-1.5/makefile b/third_party/pugixml-1.5/makefile
deleted file mode 100644
index a83b0d090a9..00000000000
--- a/third_party/pugixml-1.5/makefile
+++ /dev/null
@@ -1,31 +0,0 @@
-###############################################################################
-# Copyright IBM Corp. and others 2015
-#
-# This program and the accompanying materials are made available under
-# the terms of the Eclipse Public License 2.0 which accompanies this
-# distribution and is available at https://www.eclipse.org/legal/epl-2.0/
-# or the Apache License, Version 2.0 which accompanies this distribution and
-# is available at https://www.apache.org/licenses/LICENSE-2.0.
-#
-# This Source Code may also be made available under the following
-# Secondary Licenses when the conditions for such availability set
-# forth in the Eclipse Public License, v. 2.0 are satisfied: GNU
-# General Public License, version 2 with the GNU Classpath
-# Exception [1] and GNU General Public License, version 2 with the
-# OpenJDK Assembly Exception [2].
-#
-# [1] https://www.gnu.org/software/classpath/license.html
-# [2] https://openjdk.org/legal/assembly-exception.html
-#
-# SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0-only WITH Classpath-exception-2.0 OR GPL-2.0-only WITH OpenJDK-assembly-exception-1.0
-###############################################################################
-
-top_srcdir := ../..
-include $(top_srcdir)/omrmakefiles/configure.mk
-
-MODULE_NAME := pugixml
-ARTIFACT_TYPE := archive
-OBJECTS := pugixml$(OBJEXT)
-
-include $(top_srcdir)/omrmakefiles/rules.mk
-
diff --git a/third_party/pugixml-1.5/CMakeLists.txt b/third_party/pugixml-1.8/CMakeLists.txt
similarity index 100%
rename from third_party/pugixml-1.5/CMakeLists.txt
rename to third_party/pugixml-1.8/CMakeLists.txt
diff --git a/third_party/pugixml-1.5/pugiconfig.hpp b/third_party/pugixml-1.8/pugiconfig.hpp
similarity index 92%
rename from third_party/pugixml-1.5/pugiconfig.hpp
rename to third_party/pugixml-1.8/pugiconfig.hpp
index 26b1ef6dc97..52aa7e10c97 100644
--- a/third_party/pugixml-1.5/pugiconfig.hpp
+++ b/third_party/pugixml-1.8/pugiconfig.hpp
@@ -1,7 +1,7 @@
/**
- * pugixml parser - version 1.5
+ * pugixml parser - version 1.8
* --------------------------------------------------------
- * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2006-2016, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Report bugs and download new versions at http://pugixml.org/
*
* This library is distributed under the MIT License. See notice at the end
@@ -17,6 +17,9 @@
// Uncomment this to enable wchar_t mode
// #define PUGIXML_WCHAR_MODE
+// Uncomment this to enable compact mode
+// #define PUGIXML_COMPACT
+
// Uncomment this to disable XPath
// #define PUGIXML_NO_XPATH
@@ -39,7 +42,6 @@
// Uncomment this to switch to header-only version
// #define PUGIXML_HEADER_ONLY
-// #include "pugixml.cpp"
// Uncomment this to enable long long support
// #define PUGIXML_HAS_LONG_LONG
@@ -47,7 +49,7 @@
#endif
/**
- * Copyright (c) 2006-2014 Arseny Kapoulkine
+ * Copyright (c) 2006-2016 Arseny Kapoulkine
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@@ -60,7 +62,7 @@
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
- *
+ *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
diff --git a/third_party/pugixml-1.5/pugixml.cpp b/third_party/pugixml-1.8/pugixml.cpp
similarity index 80%
rename from third_party/pugixml-1.5/pugixml.cpp
rename to third_party/pugixml-1.8/pugixml.cpp
index 59a2ec7b255..cac51a53426 100644
--- a/third_party/pugixml-1.5/pugixml.cpp
+++ b/third_party/pugixml-1.8/pugixml.cpp
@@ -1,7 +1,7 @@
/**
- * pugixml parser - version 1.5
+ * pugixml parser - version 1.8
* --------------------------------------------------------
- * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2006-2016, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Report bugs and download new versions at http://pugixml.org/
*
* This library is distributed under the MIT License. See notice at the end
@@ -11,12 +11,6 @@
* Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
*/
-/*
- * ===========================================================================
- * Copyright IBM Corp. and others 2018
- * ===========================================================================
- */
-
#ifndef SOURCE_PUGIXML_CPP
#define SOURCE_PUGIXML_CPP
@@ -26,6 +20,7 @@
#include <stdio.h>
#include <string.h>
#include <assert.h>
+#include <limits.h>
#ifdef PUGIXML_WCHAR_MODE
# include <wchar.h>
@@ -59,7 +54,7 @@
#endif
#ifdef __INTEL_COMPILER
-# pragma warning(disable: 177) // function was declared but never referenced
+# pragma warning(disable: 177) // function was declared but never referenced
# pragma warning(disable: 279) // controlling expression is constant
# pragma warning(disable: 1478 1786) // function was declared "deprecated"
# pragma warning(disable: 1684) // conversion from pointer to same-sized integral type
@@ -87,7 +82,7 @@
#elif defined(__GNUC__)
# define PUGI__NO_INLINE __attribute__((noinline))
#else
-# define PUGI__NO_INLINE
+# define PUGI__NO_INLINE
#endif
// Branch weight controls
@@ -111,6 +106,14 @@
#if defined(__BORLANDC__) && !defined(__MEM_H_USING_LIST)
using std::memcpy;
using std::memmove;
+using std::memset;
+#endif
+
+// Some MinGW versions have headers that erroneously omit LLONG_MIN/LLONG_MAX/ULLONG_MAX definitions in strict ANSI mode
+#if defined(PUGIXML_HAS_LONG_LONG) && defined(__MINGW32__) && defined(__STRICT_ANSI__) && !defined(LLONG_MAX) && !defined(LLONG_MIN) && !defined(ULLONG_MAX)
+# define LLONG_MAX 9223372036854775807LL
+# define LLONG_MIN (-LLONG_MAX-1)
+# define ULLONG_MAX (2ULL*LLONG_MAX+1)
#endif
// In some environments MSVC is a compiler but the CRT lacks certain MSVC-specific features
@@ -136,25 +139,21 @@ using std::memmove;
#endif
// uintptr_t
-#if !defined(_MSC_VER) || _MSC_VER >= 1600
-# include <stdint.h>
-#else
+#if (defined(_MSC_VER) && _MSC_VER < 1600) || (defined(__BORLANDC__) && __BORLANDC__ < 0x561)
+namespace pugi
+{
# ifndef _UINTPTR_T_DEFINED
-// No native uintptr_t in MSVC6 and in some WinCE versions
-typedef size_t uintptr_t;
-#define _UINTPTR_T_DEFINED
+ typedef size_t uintptr_t;
# endif
-PUGI__NS_BEGIN
+
typedef unsigned __int8 uint8_t;
typedef unsigned __int16 uint16_t;
typedef unsigned __int32 uint32_t;
-PUGI__NS_END
+}
+#else
+# include <stdint.h>
#endif
-#if __cplusplus < 201103L
-#define snprintf(buf, buf_size, format, ...) sprintf(buf, format, __VA_ARGS__)
-#endif /* __cplusplus < 201103L */
-
// Memory allocation
PUGI__NS_BEGIN
PUGI__FN void* default_allocate(size_t size)
@@ -214,7 +213,7 @@ PUGI__NS_BEGIN
for (size_t i = 0; i < count; ++i)
if (lhs[i] != rhs[i])
return false;
-
+
return lhs[count] == 0;
}
@@ -231,63 +230,206 @@ PUGI__NS_BEGIN
return static_cast(end - s);
#endif
}
-
-#ifdef PUGIXML_WCHAR_MODE
- // Convert string to wide string, assuming all symbols are ASCII
- PUGI__FN void widen_ascii(wchar_t* dest, const char* source)
- {
- for (const char* i = source; *i; ++i) *dest++ = *i;
- *dest = 0;
- }
-#endif
PUGI__NS_END
-#if !defined(PUGIXML_NO_STL) || !defined(PUGIXML_NO_XPATH)
-// auto_ptr-like buffer holder for exception recovery
+// auto_ptr-like object for exception recovery
PUGI__NS_BEGIN
- struct buffer_holder
+ template <typename T> struct auto_deleter
{
- void* data;
- void (*deleter)(void*);
+ typedef void (*D)(T*);
- buffer_holder(void* data_, void (*deleter_)(void*)): data(data_), deleter(deleter_)
+ T* data;
+ D deleter;
+
+ auto_deleter(T* data_, D deleter_): data(data_), deleter(deleter_)
{
}
- ~buffer_holder()
+ ~auto_deleter()
{
if (data) deleter(data);
}
- void* release()
+ T* release()
{
- void* result = data;
+ T* result = data;
data = 0;
return result;
}
};
PUGI__NS_END
+
+#ifdef PUGIXML_COMPACT
+PUGI__NS_BEGIN
+ class compact_hash_table
+ {
+ public:
+ compact_hash_table(): _items(0), _capacity(0), _count(0)
+ {
+ }
+
+ void clear()
+ {
+ if (_items)
+ {
+ xml_memory::deallocate(_items);
+ _items = 0;
+ _capacity = 0;
+ _count = 0;
+ }
+ }
+
+ void** find(const void* key)
+ {
+ assert(key);
+
+ if (_capacity == 0) return 0;
+
+ size_t hashmod = _capacity - 1;
+ size_t bucket = hash(key) & hashmod;
+
+ for (size_t probe = 0; probe <= hashmod; ++probe)
+ {
+ item_t& probe_item = _items[bucket];
+
+ if (probe_item.key == key)
+ return &probe_item.value;
+
+ if (probe_item.key == 0)
+ return 0;
+
+ // hash collision, quadratic probing
+ bucket = (bucket + probe + 1) & hashmod;
+ }
+
+ assert(false && "Hash table is full");
+ return 0;
+ }
+
+ void** insert(const void* key)
+ {
+ assert(key);
+ assert(_capacity != 0 && _count < _capacity - _capacity / 4);
+
+ size_t hashmod = _capacity - 1;
+ size_t bucket = hash(key) & hashmod;
+
+ for (size_t probe = 0; probe <= hashmod; ++probe)
+ {
+ item_t& probe_item = _items[bucket];
+
+ if (probe_item.key == 0)
+ {
+ probe_item.key = key;
+ _count++;
+ return &probe_item.value;
+ }
+
+ if (probe_item.key == key)
+ return &probe_item.value;
+
+ // hash collision, quadratic probing
+ bucket = (bucket + probe + 1) & hashmod;
+ }
+
+ assert(false && "Hash table is full");
+ return 0;
+ }
+
+ bool reserve()
+ {
+ if (_count + 16 >= _capacity - _capacity / 4)
+ return rehash();
+
+ return true;
+ }
+
+ private:
+ struct item_t
+ {
+ const void* key;
+ void* value;
+ };
+
+ item_t* _items;
+ size_t _capacity;
+
+ size_t _count;
+
+ bool rehash();
+
+ static unsigned int hash(const void* key)
+ {
+ unsigned int h = static_cast(reinterpret_cast(key));
+
+ // MurmurHash3 32-bit finalizer
+ h ^= h >> 16;
+ h *= 0x85ebca6bu;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35u;
+ h ^= h >> 16;
+
+ return h;
+ }
+ };
+
+ PUGI__FN_NO_INLINE bool compact_hash_table::rehash()
+ {
+ compact_hash_table rt;
+ rt._capacity = (_capacity == 0) ? 32 : _capacity * 2;
+ rt._items = static_cast(xml_memory::allocate(sizeof(item_t) * rt._capacity));
+
+ if (!rt._items)
+ return false;
+
+ memset(rt._items, 0, sizeof(item_t) * rt._capacity);
+
+ for (size_t i = 0; i < _capacity; ++i)
+ if (_items[i].key)
+ *rt.insert(_items[i].key) = _items[i].value;
+
+ if (_items)
+ xml_memory::deallocate(_items);
+
+ _capacity = rt._capacity;
+ _items = rt._items;
+
+ assert(_count == rt._count);
+
+ return true;
+ }
+
+PUGI__NS_END
#endif
PUGI__NS_BEGIN
- static const size_t xml_memory_page_size =
- #ifdef PUGIXML_MEMORY_PAGE_SIZE
- PUGIXML_MEMORY_PAGE_SIZE
- #else
- 32768
- #endif
- ;
+#ifdef PUGIXML_COMPACT
+ static const uintptr_t xml_memory_block_alignment = 4;
+#else
+ static const uintptr_t xml_memory_block_alignment = sizeof(void*);
+#endif
- static const uintptr_t xml_memory_page_alignment = 64;
- static const uintptr_t xml_memory_page_pointer_mask = ~(xml_memory_page_alignment - 1);
- static const uintptr_t xml_memory_page_contents_shared_mask = 32;
- static const uintptr_t xml_memory_page_name_allocated_mask = 16;
- static const uintptr_t xml_memory_page_value_allocated_mask = 8;
- static const uintptr_t xml_memory_page_type_mask = 7;
+ // extra metadata bits
+ static const uintptr_t xml_memory_page_contents_shared_mask = 64;
+ static const uintptr_t xml_memory_page_name_allocated_mask = 32;
+ static const uintptr_t xml_memory_page_value_allocated_mask = 16;
+ static const uintptr_t xml_memory_page_type_mask = 15;
+
+ // combined masks for string uniqueness
static const uintptr_t xml_memory_page_name_allocated_or_shared_mask = xml_memory_page_name_allocated_mask | xml_memory_page_contents_shared_mask;
static const uintptr_t xml_memory_page_value_allocated_or_shared_mask = xml_memory_page_value_allocated_mask | xml_memory_page_contents_shared_mask;
- #define PUGI__NODETYPE(n) static_cast(((n)->header & impl::xml_memory_page_type_mask) + 1)
+#ifdef PUGIXML_COMPACT
+ #define PUGI__GETHEADER_IMPL(object, page, flags) // unused
+ #define PUGI__GETPAGE_IMPL(header) (header).get_page()
+#else
+ #define PUGI__GETHEADER_IMPL(object, page, flags) (((reinterpret_cast(object) - reinterpret_cast(page)) << 8) | (flags))
+ // this macro casts pointers through void* to avoid 'cast increases required alignment of target type' warnings
+ #define PUGI__GETPAGE_IMPL(header) static_cast(const_cast(static_cast(reinterpret_cast(&header) - (header >> 8))))
+#endif
+
+ #define PUGI__GETPAGE(n) PUGI__GETPAGE_IMPL((n)->header)
+ #define PUGI__NODETYPE(n) static_cast((n)->header & impl::xml_memory_page_type_mask)
struct xml_allocator;
@@ -303,6 +445,12 @@ PUGI__NS_BEGIN
result->busy_size = 0;
result->freed_size = 0;
+ #ifdef PUGIXML_COMPACT
+ result->compact_string_base = 0;
+ result->compact_shared_parent = 0;
+ result->compact_page_marker = 0;
+ #endif
+
return result;
}
@@ -313,8 +461,22 @@ PUGI__NS_BEGIN
size_t busy_size;
size_t freed_size;
+
+ #ifdef PUGIXML_COMPACT
+ char_t* compact_string_base;
+ void* compact_shared_parent;
+ uint32_t* compact_page_marker;
+ #endif
};
+ static const size_t xml_memory_page_size =
+ #ifdef PUGIXML_MEMORY_PAGE_SIZE
+ (PUGIXML_MEMORY_PAGE_SIZE)
+ #else
+ 32768
+ #endif
+ - sizeof(xml_memory_page);
+
struct xml_memory_string_header
{
uint16_t page_offset; // offset from page->data
@@ -325,6 +487,9 @@ PUGI__NS_BEGIN
{
xml_allocator(xml_memory_page* root): _root(root), _busy_size(root->busy_size)
{
+ #ifdef PUGIXML_COMPACT
+ _hash = 0;
+ #endif
}
xml_memory_page* allocate_page(size_t data_size)
@@ -332,37 +497,29 @@ PUGI__NS_BEGIN
size_t size = sizeof(xml_memory_page) + data_size;
// allocate block with some alignment, leaving memory for worst-case padding
- void* memory = xml_memory::allocate(size + xml_memory_page_alignment);
+ void* memory = xml_memory::allocate(size);
if (!memory) return 0;
- // align to next page boundary (note: this guarantees at least 1 usable byte before the page)
- char* page_memory = reinterpret_cast((reinterpret_cast(memory) + xml_memory_page_alignment) & ~(xml_memory_page_alignment - 1));
-
// prepare page structure
- xml_memory_page* page = xml_memory_page::construct(page_memory);
+ xml_memory_page* page = xml_memory_page::construct(memory);
assert(page);
page->allocator = _root->allocator;
- // record the offset for freeing the memory block
- assert(page_memory > memory && page_memory - static_cast(memory) <= 127);
- page_memory[-1] = static_cast(page_memory - static_cast(memory));
-
return page;
}
static void deallocate_page(xml_memory_page* page)
{
- char* page_memory = reinterpret_cast(page);
-
- xml_memory::deallocate(page_memory - page_memory[-1]);
+ xml_memory::deallocate(page);
}
void* allocate_memory_oob(size_t size, xml_memory_page*& out_page);
void* allocate_memory(size_t size, xml_memory_page*& out_page)
{
- if (_busy_size + size > xml_memory_page_size) return allocate_memory_oob(size, out_page);
+ if (PUGI__UNLIKELY(_busy_size + size > xml_memory_page_size))
+ return allocate_memory_oob(size, out_page);
void* buf = reinterpret_cast(_root) + sizeof(xml_memory_page) + _busy_size;
@@ -373,6 +530,44 @@ PUGI__NS_BEGIN
return buf;
}
+ #ifdef PUGIXML_COMPACT
+ void* allocate_object(size_t size, xml_memory_page*& out_page)
+ {
+ void* result = allocate_memory(size + sizeof(uint32_t), out_page);
+ if (!result) return 0;
+
+ // adjust for marker
+ ptrdiff_t offset = static_cast(result) - reinterpret_cast(out_page->compact_page_marker);
+
+ if (PUGI__UNLIKELY(static_cast(offset) >= 256 * xml_memory_block_alignment))
+ {
+ // insert new marker
+ uint32_t* marker = static_cast(result);
+
+ *marker = static_cast(reinterpret_cast(marker) - reinterpret_cast(out_page));
+ out_page->compact_page_marker = marker;
+
+ // since we don't reuse the page space until we reallocate it, we can just pretend that we freed the marker block
+ // this will make sure deallocate_memory correctly tracks the size
+ out_page->freed_size += sizeof(uint32_t);
+
+ return marker + 1;
+ }
+ else
+ {
+ // roll back uint32_t part
+ _busy_size -= sizeof(uint32_t);
+
+ return result;
+ }
+ }
+ #else
+ void* allocate_object(size_t size, xml_memory_page*& out_page)
+ {
+ return allocate_memory(size, out_page);
+ }
+ #endif
+
void deallocate_memory(void* ptr, size_t size, xml_memory_page* page)
{
if (page == _root) page->busy_size = _busy_size;
@@ -390,7 +585,16 @@ PUGI__NS_BEGIN
assert(_root == page);
// top page freed, just reset sizes
- page->busy_size = page->freed_size = 0;
+ page->busy_size = 0;
+ page->freed_size = 0;
+
+ #ifdef PUGIXML_COMPACT
+ // reset compact state to maximize efficiency
+ page->compact_string_base = 0;
+ page->compact_shared_parent = 0;
+ page->compact_page_marker = 0;
+ #endif
+
_busy_size = 0;
}
else
@@ -410,13 +614,15 @@ PUGI__NS_BEGIN
char_t* allocate_string(size_t length)
{
- PUGI__STATIC_ASSERT(xml_memory_page_size <= (1 << 16));
+ static const size_t max_encoded_offset = (1 << 16) * xml_memory_block_alignment;
+
+ PUGI__STATIC_ASSERT(xml_memory_page_size <= max_encoded_offset);
// allocate memory for string and header block
size_t size = sizeof(xml_memory_string_header) + length * sizeof(char_t);
-
- // round size up to pointer alignment boundary
- size_t full_size = (size + (sizeof(void*) - 1)) & ~(sizeof(void*) - 1);
+
+ // round size up to block alignment boundary
+ size_t full_size = (size + (xml_memory_block_alignment - 1)) & ~(xml_memory_block_alignment - 1);
xml_memory_page* page;
xml_memory_string_header* header = static_cast(allocate_memory(full_size, page));
@@ -426,12 +632,14 @@ PUGI__NS_BEGIN
// setup header
ptrdiff_t page_offset = reinterpret_cast(header) - reinterpret_cast(page) - sizeof(xml_memory_page);
- assert(page_offset >= 0 && page_offset < (1 << 16));
- header->page_offset = static_cast(page_offset);
+ assert(page_offset % xml_memory_block_alignment == 0);
+ assert(page_offset >= 0 && static_cast(page_offset) < max_encoded_offset);
+ header->page_offset = static_cast(static_cast(page_offset) / xml_memory_block_alignment);
// full_size == 0 for large strings that occupy the whole page
- assert(full_size < (1 << 16) || (page->busy_size == full_size && page_offset == 0));
- header->full_size = static_cast(full_size < (1 << 16) ? full_size : 0);
+ assert(full_size % xml_memory_block_alignment == 0);
+ assert(full_size < max_encoded_offset || (page->busy_size == full_size && page_offset == 0));
+ header->full_size = static_cast(full_size < max_encoded_offset ? full_size / xml_memory_block_alignment : 0);
// round-trip through void* to avoid 'cast increases required alignment of target type' warning
// header is guaranteed a pointer-sized alignment, which should be enough for char_t
@@ -448,102 +656,458 @@ PUGI__NS_BEGIN
assert(header);
// deallocate
- size_t page_offset = sizeof(xml_memory_page) + header->page_offset;
+ size_t page_offset = sizeof(xml_memory_page) + header->page_offset * xml_memory_block_alignment;
xml_memory_page* page = reinterpret_cast(static_cast(reinterpret_cast(header) - page_offset));
// if full_size == 0 then this string occupies the whole page
- size_t full_size = header->full_size == 0 ? page->busy_size : header->full_size;
+ size_t full_size = header->full_size == 0 ? page->busy_size : header->full_size * xml_memory_block_alignment;
deallocate_memory(header, full_size, page);
}
+ bool reserve()
+ {
+ #ifdef PUGIXML_COMPACT
+ return _hash->reserve();
+ #else
+ return true;
+ #endif
+ }
+
xml_memory_page* _root;
size_t _busy_size;
+
+ #ifdef PUGIXML_COMPACT
+ compact_hash_table* _hash;
+ #endif
};
PUGI__FN_NO_INLINE void* xml_allocator::allocate_memory_oob(size_t size, xml_memory_page*& out_page)
{
const size_t large_allocation_threshold = xml_memory_page_size / 4;
- xml_memory_page* page = allocate_page(size <= large_allocation_threshold ? xml_memory_page_size : size);
- out_page = page;
+ xml_memory_page* page = allocate_page(size <= large_allocation_threshold ? xml_memory_page_size : size);
+ out_page = page;
+
+ if (!page) return 0;
+
+ if (size <= large_allocation_threshold)
+ {
+ _root->busy_size = _busy_size;
+
+ // insert page at the end of linked list
+ page->prev = _root;
+ _root->next = page;
+ _root = page;
+
+ _busy_size = size;
+ }
+ else
+ {
+ // insert page before the end of linked list, so that it is deleted as soon as possible
+ // the last page is not deleted even if it's empty (see deallocate_memory)
+ assert(_root->prev);
+
+ page->prev = _root->prev;
+ page->next = _root;
+
+ _root->prev->next = page;
+ _root->prev = page;
+
+ page->busy_size = size;
+ }
+
+ return reinterpret_cast(page) + sizeof(xml_memory_page);
+ }
+PUGI__NS_END
+
+#ifdef PUGIXML_COMPACT
+PUGI__NS_BEGIN
+ static const uintptr_t compact_alignment_log2 = 2;
+ static const uintptr_t compact_alignment = 1 << compact_alignment_log2;
+
+ class compact_header
+ {
+ public:
+ compact_header(xml_memory_page* page, unsigned int flags)
+ {
+ PUGI__STATIC_ASSERT(xml_memory_block_alignment == compact_alignment);
+
+ ptrdiff_t offset = (reinterpret_cast(this) - reinterpret_cast(page->compact_page_marker));
+ assert(offset % compact_alignment == 0 && static_cast(offset) < 256 * compact_alignment);
+
+ _page = static_cast(offset >> compact_alignment_log2);
+ _flags = static_cast(flags);
+ }
+
+ void operator&=(uintptr_t mod)
+ {
+ _flags &= static_cast(mod);
+ }
+
+ void operator|=(uintptr_t mod)
+ {
+ _flags |= static_cast(mod);
+ }
+
+ uintptr_t operator&(uintptr_t mod) const
+ {
+ return _flags & mod;
+ }
+
+ xml_memory_page* get_page() const
+ {
+ // round-trip through void* to silence 'cast increases required alignment of target type' warnings
+ const char* page_marker = reinterpret_cast(this) - (_page << compact_alignment_log2);
+ const char* page = page_marker - *reinterpret_cast(static_cast(page_marker));
+
+ return const_cast(reinterpret_cast(static_cast(page)));
+ }
+
+ private:
+ unsigned char _page;
+ unsigned char _flags;
+ };
+
+ PUGI__FN xml_memory_page* compact_get_page(const void* object, int header_offset)
+ {
+ const compact_header* header = reinterpret_cast(static_cast(object) - header_offset);
+
+ return header->get_page();
+ }
+
+ template PUGI__FN_NO_INLINE T* compact_get_value(const void* object)
+ {
+ return static_cast(*compact_get_page(object, header_offset)->allocator->_hash->find(object));
+ }
+
+ template PUGI__FN_NO_INLINE void compact_set_value(const void* object, T* value)
+ {
+ *compact_get_page(object, header_offset)->allocator->_hash->insert(object) = value;
+ }
+
+ template class compact_pointer
+ {
+ public:
+ compact_pointer(): _data(0)
+ {
+ }
+
+ void operator=(const compact_pointer& rhs)
+ {
+ *this = rhs + 0;
+ }
+
+ void operator=(T* value)
+ {
+ if (value)
+ {
+ // value is guaranteed to be compact-aligned; 'this' is not
+ // our decoding is based on 'this' aligned to compact alignment downwards (see operator T*)
+ // so for negative offsets (e.g. -3) we need to adjust the diff by compact_alignment - 1 to
+ // compensate for arithmetic shift rounding for negative values
+ ptrdiff_t diff = reinterpret_cast(value) - reinterpret_cast(this);
+ ptrdiff_t offset = ((diff + int(compact_alignment - 1)) >> compact_alignment_log2) - start;
+
+ if (static_cast(offset) <= 253)
+ _data = static_cast(offset + 1);
+ else
+ {
+ compact_set_value(this, value);
+
+ _data = 255;
+ }
+ }
+ else
+ _data = 0;
+ }
+
+ operator T*() const
+ {
+ if (_data)
+ {
+ if (_data < 255)
+ {
+ uintptr_t base = reinterpret_cast(this) & ~(compact_alignment - 1);
+
+ return reinterpret_cast(base + ((_data - 1 + start) << compact_alignment_log2));
+ }
+ else
+ return compact_get_value(this);
+ }
+ else
+ return 0;
+ }
+
+ T* operator->() const
+ {
+ return *this;
+ }
+
+ private:
+ unsigned char _data;
+ };
+
+ template class compact_pointer_parent
+ {
+ public:
+ compact_pointer_parent(): _data(0)
+ {
+ }
+
+ void operator=(const compact_pointer_parent& rhs)
+ {
+ *this = rhs + 0;
+ }
+
+ void operator=(T* value)
+ {
+ if (value)
+ {
+ // value is guaranteed to be compact-aligned; 'this' is not
+ // our decoding is based on 'this' aligned to compact alignment downwards (see operator T*)
+ // so for negative offsets (e.g. -3) we need to adjust the diff by compact_alignment - 1 to
+ // compensate for arithmetic shift behavior for negative values
+ ptrdiff_t diff = reinterpret_cast(value) - reinterpret_cast(this);
+ ptrdiff_t offset = ((diff + int(compact_alignment - 1)) >> compact_alignment_log2) + 65533;
+
+ if (static_cast(offset) <= 65533)
+ {
+ _data = static_cast(offset + 1);
+ }
+ else
+ {
+ xml_memory_page* page = compact_get_page(this, header_offset);
+
+ if (PUGI__UNLIKELY(page->compact_shared_parent == 0))
+ page->compact_shared_parent = value;
+
+ if (page->compact_shared_parent == value)
+ {
+ _data = 65534;
+ }
+ else
+ {
+ compact_set_value(this, value);
+
+ _data = 65535;
+ }
+ }
+ }
+ else
+ {
+ _data = 0;
+ }
+ }
+
+ operator T*() const
+ {
+ if (_data)
+ {
+ if (_data < 65534)
+ {
+ uintptr_t base = reinterpret_cast(this) & ~(compact_alignment - 1);
+
+ return reinterpret_cast(base + ((_data - 1 - 65533) << compact_alignment_log2));
+ }
+ else if (_data == 65534)
+ return static_cast(compact_get_page(this, header_offset)->compact_shared_parent);
+ else
+ return compact_get_value(this);
+ }
+ else
+ return 0;
+ }
+
+ T* operator->() const
+ {
+ return *this;
+ }
+
+ private:
+ uint16_t _data;
+ };
+
+ template class compact_string
+ {
+ public:
+ compact_string(): _data(0)
+ {
+ }
+
+ void operator=(const compact_string& rhs)
+ {
+ *this = rhs + 0;
+ }
+
+ void operator=(char_t* value)
+ {
+ if (value)
+ {
+ xml_memory_page* page = compact_get_page(this, header_offset);
+
+ if (PUGI__UNLIKELY(page->compact_string_base == 0))
+ page->compact_string_base = value;
+
+ ptrdiff_t offset = value - page->compact_string_base;
+
+ if (static_cast(offset) < (65535 << 7))
+ {
+ // round-trip through void* to silence 'cast increases required alignment of target type' warnings
+ uint16_t* base = reinterpret_cast(static_cast(reinterpret_cast(this) - base_offset));
+
+ if (*base == 0)
+ {
+ *base = static_cast((offset >> 7) + 1);
+ _data = static_cast((offset & 127) + 1);
+ }
+ else
+ {
+ ptrdiff_t remainder = offset - ((*base - 1) << 7);
+
+ if (static_cast(remainder) <= 253)
+ {
+ _data = static_cast(remainder + 1);
+ }
+ else
+ {
+ compact_set_value(this, value);
+
+ _data = 255;
+ }
+ }
+ }
+ else
+ {
+ compact_set_value(this, value);
+
+ _data = 255;
+ }
+ }
+ else
+ {
+ _data = 0;
+ }
+ }
+
+ operator char_t*() const
+ {
+ if (_data)
+ {
+ if (_data < 255)
+ {
+ xml_memory_page* page = compact_get_page(this, header_offset);
+
+ // round-trip through void* to silence 'cast increases required alignment of target type' warnings
+ const uint16_t* base = reinterpret_cast(static_cast(reinterpret_cast(this) - base_offset));
+ assert(*base);
+
+ ptrdiff_t offset = ((*base - 1) << 7) + (_data - 1);
+
+ return page->compact_string_base + offset;
+ }
+ else
+ {
+ return compact_get_value(this);
+ }
+ }
+ else
+ return 0;
+ }
+
+ private:
+ unsigned char _data;
+ };
+PUGI__NS_END
+#endif
+
+#ifdef PUGIXML_COMPACT
+namespace pugi
+{
+ struct xml_attribute_struct
+ {
+ xml_attribute_struct(impl::xml_memory_page* page): header(page, 0), namevalue_base(0)
+ {
+ PUGI__STATIC_ASSERT(sizeof(xml_attribute_struct) == 8);
+ }
+
+ impl::compact_header header;
+
+ uint16_t namevalue_base;
- if (!page) return 0;
+ impl::compact_string<4, 2> name;
+ impl::compact_string<5, 3> value;
- if (size <= large_allocation_threshold)
+ impl::compact_pointer prev_attribute_c;
+ impl::compact_pointer next_attribute;
+ };
+
+ struct xml_node_struct
+ {
+ xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(page, type), namevalue_base(0)
{
- _root->busy_size = _busy_size;
+ PUGI__STATIC_ASSERT(sizeof(xml_node_struct) == 12);
+ }
- // insert page at the end of linked list
- page->prev = _root;
- _root->next = page;
- _root = page;
+ impl::compact_header header;
- _busy_size = size;
- }
- else
- {
- // insert page before the end of linked list, so that it is deleted as soon as possible
- // the last page is not deleted even if it's empty (see deallocate_memory)
- assert(_root->prev);
+ uint16_t namevalue_base;
- page->prev = _root->prev;
- page->next = _root;
+ impl::compact_string<4, 2> name;
+ impl::compact_string<5, 3> value;
- _root->prev->next = page;
- _root->prev = page;
- }
+ impl::compact_pointer_parent parent;
- // allocate inside page
- page->busy_size = size;
+ impl::compact_pointer first_child;
- return reinterpret_cast(page) + sizeof(xml_memory_page);
- }
-PUGI__NS_END
+ impl::compact_pointer prev_sibling_c;
+ impl::compact_pointer next_sibling;
+ impl::compact_pointer first_attribute;
+ };
+}
+#else
namespace pugi
{
- /// A 'name=value' XML attribute structure.
struct xml_attribute_struct
{
- /// Default ctor
- xml_attribute_struct(impl::xml_memory_page* page): header(reinterpret_cast(page)), name(0), value(0), prev_attribute_c(0), next_attribute(0)
+ xml_attribute_struct(impl::xml_memory_page* page): name(0), value(0), prev_attribute_c(0), next_attribute(0)
{
+ header = PUGI__GETHEADER_IMPL(this, page, 0);
}
uintptr_t header;
- char_t* name; ///< Pointer to attribute name.
- char_t* value; ///< Pointer to attribute value.
+ char_t* name;
+ char_t* value;
- xml_attribute_struct* prev_attribute_c; ///< Previous attribute (cyclic list)
- xml_attribute_struct* next_attribute; ///< Next attribute
+ xml_attribute_struct* prev_attribute_c;
+ xml_attribute_struct* next_attribute;
};
- /// An XML document tree node.
struct xml_node_struct
{
- /// Default ctor
- /// \param type - node type
- xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(reinterpret_cast(page) | (type - 1)), parent(0), name(0), value(0), first_child(0), prev_sibling_c(0), next_sibling(0), first_attribute(0)
+ xml_node_struct(impl::xml_memory_page* page, xml_node_type type): name(0), value(0), parent(0), first_child(0), prev_sibling_c(0), next_sibling(0), first_attribute(0)
{
+ header = PUGI__GETHEADER_IMPL(this, page, type);
}
uintptr_t header;
- xml_node_struct* parent; ///< Pointer to parent
+ char_t* name;
+ char_t* value;
+
+ xml_node_struct* parent;
- char_t* name; ///< Pointer to element name.
- char_t* value; ///< Pointer to any associated string data.
+ xml_node_struct* first_child;
- xml_node_struct* first_child; ///< First child
-
- xml_node_struct* prev_sibling_c; ///< Left brother (cyclic list)
- xml_node_struct* next_sibling; ///< Right brother
-
- xml_attribute_struct* first_attribute; ///< First attribute
+ xml_node_struct* prev_sibling_c;
+ xml_node_struct* next_sibling;
+
+ xml_attribute_struct* first_attribute;
};
}
+#endif
PUGI__NS_BEGIN
struct xml_extra_buffer
@@ -561,20 +1125,24 @@ PUGI__NS_BEGIN
const char_t* buffer;
xml_extra_buffer* extra_buffers;
+
+ #ifdef PUGIXML_COMPACT
+ compact_hash_table hash;
+ #endif
};
- inline xml_allocator& get_allocator(const xml_node_struct* node)
+ template inline xml_allocator& get_allocator(const Object* object)
{
- assert(node);
+ assert(object);
- return *reinterpret_cast(node->header & xml_memory_page_pointer_mask)->allocator;
+ return *PUGI__GETPAGE(object)->allocator;
}
template inline xml_document_struct& get_document(const Object* object)
{
assert(object);
- return *static_cast(reinterpret_cast(object->header & xml_memory_page_pointer_mask)->allocator);
+ return *static_cast(PUGI__GETPAGE(object)->allocator);
}
PUGI__NS_END
@@ -583,7 +1151,8 @@ PUGI__NS_BEGIN
inline xml_attribute_struct* allocate_attribute(xml_allocator& alloc)
{
xml_memory_page* page;
- void* memory = alloc.allocate_memory(sizeof(xml_attribute_struct), page);
+ void* memory = alloc.allocate_object(sizeof(xml_attribute_struct), page);
+ if (!memory) return 0;
return new (memory) xml_attribute_struct(page);
}
@@ -591,27 +1160,30 @@ PUGI__NS_BEGIN
inline xml_node_struct* allocate_node(xml_allocator& alloc, xml_node_type type)
{
xml_memory_page* page;
- void* memory = alloc.allocate_memory(sizeof(xml_node_struct), page);
+ void* memory = alloc.allocate_object(sizeof(xml_node_struct), page);
+ if (!memory) return 0;
return new (memory) xml_node_struct(page, type);
}
inline void destroy_attribute(xml_attribute_struct* a, xml_allocator& alloc)
{
- uintptr_t header = a->header;
+ if (a->header & impl::xml_memory_page_name_allocated_mask)
+ alloc.deallocate_string(a->name);
- if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(a->name);
- if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(a->value);
+ if (a->header & impl::xml_memory_page_value_allocated_mask)
+ alloc.deallocate_string(a->value);
- alloc.deallocate_memory(a, sizeof(xml_attribute_struct), reinterpret_cast(header & xml_memory_page_pointer_mask));
+ alloc.deallocate_memory(a, sizeof(xml_attribute_struct), PUGI__GETPAGE(a));
}
inline void destroy_node(xml_node_struct* n, xml_allocator& alloc)
{
- uintptr_t header = n->header;
+ if (n->header & impl::xml_memory_page_name_allocated_mask)
+ alloc.deallocate_string(n->name);
- if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(n->name);
- if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(n->value);
+ if (n->header & impl::xml_memory_page_value_allocated_mask)
+ alloc.deallocate_string(n->value);
for (xml_attribute_struct* attr = n->first_attribute; attr; )
{
@@ -631,7 +1203,7 @@ PUGI__NS_BEGIN
child = next;
}
- alloc.deallocate_memory(n, sizeof(xml_node_struct), reinterpret_cast(header & xml_memory_page_pointer_mask));
+ alloc.deallocate_memory(n, sizeof(xml_node_struct), PUGI__GETPAGE(n));
}
inline void append_node(xml_node_struct* child, xml_node_struct* node)
@@ -803,6 +1375,8 @@ PUGI__NS_BEGIN
PUGI__FN_NO_INLINE xml_node_struct* append_new_node(xml_node_struct* node, xml_allocator& alloc, xml_node_type type = node_element)
{
+ if (!alloc.reserve()) return 0;
+
xml_node_struct* child = allocate_node(alloc, type);
if (!child) return 0;
@@ -813,6 +1387,8 @@ PUGI__NS_BEGIN
PUGI__FN_NO_INLINE xml_attribute_struct* append_new_attribute(xml_node_struct* node, xml_allocator& alloc)
{
+ if (!alloc.reserve()) return 0;
+
xml_attribute_struct* attr = allocate_attribute(alloc);
if (!attr) return 0;
@@ -1018,28 +1594,11 @@ PUGI__NS_BEGIN
}
};
- template struct wchar_selector;
-
- template <> struct wchar_selector<2>
- {
- typedef uint16_t type;
- typedef utf16_counter counter;
- typedef utf16_writer writer;
- };
-
- template <> struct wchar_selector<4>
+ struct utf8_decoder
{
- typedef uint32_t type;
- typedef utf32_counter counter;
- typedef utf32_writer writer;
- };
-
- typedef wchar_selector::counter wchar_counter;
- typedef wchar_selector::writer wchar_writer;
+ typedef uint8_t type;
- template struct utf_decoder
- {
- static inline typename Traits::value_type decode_utf8_block(const uint8_t* data, size_t size, typename Traits::value_type result)
+ template static inline typename Traits::value_type process(const uint8_t* data, size_t size, typename Traits::value_type result, Traits)
{
const uint8_t utf8_byte_mask = 0x3f;
@@ -1100,29 +1659,34 @@ PUGI__NS_BEGIN
return result;
}
+ };
- static inline typename Traits::value_type decode_utf16_block(const uint16_t* data, size_t size, typename Traits::value_type result)
- {
- const uint16_t* end = data + size;
+ template struct utf16_decoder
+ {
+ typedef uint16_t type;
- while (data < end)
+ template static inline typename Traits::value_type process(const uint16_t* data, size_t size, typename Traits::value_type result, Traits)
+ {
+ while (size)
{
- unsigned int lead = opt_swap::value ? endian_swap(*data) : *data;
+ uint16_t lead = opt_swap::value ? endian_swap(*data) : *data;
// U+0000..U+D7FF
if (lead < 0xD800)
{
result = Traits::low(result, lead);
data += 1;
+ size -= 1;
}
// U+E000..U+FFFF
else if (static_cast(lead - 0xE000) < 0x2000)
{
result = Traits::low(result, lead);
data += 1;
+ size -= 1;
}
// surrogate pair lead
- else if (static_cast(lead - 0xD800) < 0x400 && data + 1 < end)
+ else if (static_cast(lead - 0xD800) < 0x400 && size >= 2)
{
uint16_t next = opt_swap::value ? endian_swap(data[1]) : data[1];
@@ -1130,26 +1694,32 @@ PUGI__NS_BEGIN
{
result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff));
data += 2;
+ size -= 2;
}
else
{
data += 1;
+ size -= 1;
}
}
else
{
data += 1;
+ size -= 1;
}
}
return result;
}
+ };
- static inline typename Traits::value_type decode_utf32_block(const uint32_t* data, size_t size, typename Traits::value_type result)
- {
- const uint32_t* end = data + size;
+ template struct utf32_decoder
+ {
+ typedef uint32_t type;
- while (data < end)
+ template static inline typename Traits::value_type process(const uint32_t* data, size_t size, typename Traits::value_type result, Traits)
+ {
+ while (size)
{
uint32_t lead = opt_swap::value ? endian_swap(*data) : *data;
@@ -1158,53 +1728,76 @@ PUGI__NS_BEGIN
{
result = Traits::low(result, lead);
data += 1;
+ size -= 1;
}
// U+10000..U+10FFFF
else
{
result = Traits::high(result, lead);
data += 1;
+ size -= 1;
}
}
return result;
}
+ };
+
+ struct latin1_decoder
+ {
+ typedef uint8_t type;
- static inline typename Traits::value_type decode_latin1_block(const uint8_t* data, size_t size, typename Traits::value_type result)
+ template static inline typename Traits::value_type process(const uint8_t* data, size_t size, typename Traits::value_type result, Traits)
{
- for (size_t i = 0; i < size; ++i)
+ while (size)
{
- result = Traits::low(result, data[i]);
+ result = Traits::low(result, *data);
+ data += 1;
+ size -= 1;
}
return result;
}
+ };
- static inline typename Traits::value_type decode_wchar_block_impl(const uint16_t* data, size_t size, typename Traits::value_type result)
- {
- return decode_utf16_block(data, size, result);
- }
+ template struct wchar_selector;
- static inline typename Traits::value_type decode_wchar_block_impl(const uint32_t* data, size_t size, typename Traits::value_type result)
- {
- return decode_utf32_block(data, size, result);
- }
+ template <> struct wchar_selector<2>
+ {
+ typedef uint16_t type;
+ typedef utf16_counter counter;
+ typedef utf16_writer writer;
+ typedef utf16_decoder decoder;
+ };
- static inline typename Traits::value_type decode_wchar_block(const wchar_t* data, size_t size, typename Traits::value_type result)
- {
- return decode_wchar_block_impl(reinterpret_cast::type*>(data), size, result);
- }
+ template <> struct wchar_selector<4>
+ {
+ typedef uint32_t type;
+ typedef utf32_counter counter;
+ typedef utf32_writer writer;
+ typedef utf32_decoder decoder;
};
- template PUGI__FN void convert_utf_endian_swap(T* result, const T* data, size_t length)
+ typedef wchar_selector::counter wchar_counter;
+ typedef wchar_selector::writer wchar_writer;
+
+ struct wchar_decoder
{
- for (size_t i = 0; i < length; ++i) result[i] = endian_swap(data[i]);
- }
+ typedef wchar_t type;
+
+ template static inline typename Traits::value_type process(const wchar_t* data, size_t size, typename Traits::value_type result, Traits traits)
+ {
+ typedef wchar_selector::decoder decoder;
+
+ return decoder::process(reinterpret_cast(data), size, result, traits);
+ }
+ };
#ifdef PUGIXML_WCHAR_MODE
PUGI__FN void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length)
{
- for (size_t i = 0; i < length; ++i) result[i] = static_cast(endian_swap(static_cast::type>(data[i])));
+ for (size_t i = 0; i < length; ++i)
+ result[i] = static_cast(endian_swap(static_cast::type>(data[i])));
}
#endif
PUGI__NS_END
@@ -1222,7 +1815,6 @@ PUGI__NS_BEGIN
ct_start_symbol = 128 // Any symbol > 127, a-z, A-Z, _, :
};
-#if !defined(OMR_EBCDIC)
static const unsigned char chartype_table[256] =
{
55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15
@@ -1243,27 +1835,6 @@ PUGI__NS_BEGIN
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
-#else /* !defined(OMR_EBCDIC) */
- static const unsigned char chartype_table[256] =
- {
- 55, 0, 0, 0, 192, 12, 192, 0, 192, 192, 192, 0, 0, 63, 0, 0,
- 0, 0, 0, 0, 192, 12, 0, 192, 0, 0, 192, 192, 0, 0, 0, 0,
- 192, 192, 192, 192, 192, 192, 0, 0, 192, 192, 192, 192, 192, 0, 0, 0,
- 192, 192, 0, 192, 192, 192, 192, 0, 192, 192, 192, 192, 0, 0, 192, 0,
- 8, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 64, 1, 0, 0, 0,
- 7, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 0, 0, 0, 0,
- 96, 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 192, 48, 0,
- 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 192, 0, 0, 6, 0, 6,
- 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
- 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
- 192, 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 192, 192,
- 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 16, 192, 192,
- 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
- 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
- 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
- 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 192, 192, 192, 192, 192, 192
- };
-#endif /* !defined(OMR_EBCDIC) */
enum chartypex_t
{
@@ -1273,8 +1844,7 @@ PUGI__NS_BEGIN
ctx_digit = 8, // 0-9
ctx_symbol = 16 // Any symbol > 127, a-z, A-Z, 0-9, _, -, .
};
-
-#if !defined(OMR_EBCDIC)
+
static const unsigned char chartypex_table[256] =
{
3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 2, 3, 3, // 0-15
@@ -1296,28 +1866,6 @@ PUGI__NS_BEGIN
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
};
-
-#else /* !defined(OMR_EBCDIC) */
- static const unsigned char chartypex_table[256] =
- {
- 3, 3, 3, 3, 20, 0, 20, 0, 20, 20, 20, 3, 3, 2, 3, 3,
- 3, 3, 3, 3, 20, 2, 3, 20, 3, 3, 20, 20, 3, 3, 3, 3,
- 20, 20, 20, 20, 20, 20, 3, 3, 20, 20, 20, 20, 20, 3, 3, 3,
- 20, 20, 3, 20, 20, 20, 20, 3, 20, 20, 20, 20, 3, 3, 20, 3,
- 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 16, 3, 0, 0, 0,
- 3, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, 0,
- 16, 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 20, 3, 0,
- 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, 0, 2,
- 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
- 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
- 20, 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 20, 20,
- 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 20, 20,
- 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
- 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
- 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
- 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 20, 20, 20, 20, 20, 20
- };
-#endif /* !defined(OMR_EBCDIC) */
#ifdef PUGIXML_WCHAR_MODE
#define PUGI__IS_CHARTYPE_IMPL(c, ct, table) ((static_cast(c) < 128 ? table[static_cast(c)] : table[128]) & (ct))
@@ -1341,12 +1889,71 @@ PUGI__NS_BEGIN
if (sizeof(wchar_t) == 2)
return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
- else
+ else
return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
}
- PUGI__FN xml_encoding guess_buffer_encoding(uint8_t d0, uint8_t d1, uint8_t d2, uint8_t d3)
+ PUGI__FN bool parse_declaration_encoding(const uint8_t* data, size_t size, const uint8_t*& out_encoding, size_t& out_length)
+ {
+ #define PUGI__SCANCHAR(ch) { if (offset >= size || data[offset] != ch) return false; offset++; }
+ #define PUGI__SCANCHARTYPE(ct) { while (offset < size && PUGI__IS_CHARTYPE(data[offset], ct)) offset++; }
+
+ // check if we have a non-empty XML declaration
+ if (size < 6 || !((data[0] == '<') & (data[1] == '?') & (data[2] == 'x') & (data[3] == 'm') & (data[4] == 'l') && PUGI__IS_CHARTYPE(data[5], ct_space)))
+ return false;
+
+ // scan XML declaration until the encoding field
+ for (size_t i = 6; i + 1 < size; ++i)
+ {
+ // declaration can not contain ? in quoted values
+ if (data[i] == '?')
+ return false;
+
+ if (data[i] == 'e' && data[i + 1] == 'n')
+ {
+ size_t offset = i;
+
+ // encoding follows the version field which can't contain 'en' so this has to be the encoding if XML is well formed
+ PUGI__SCANCHAR('e'); PUGI__SCANCHAR('n'); PUGI__SCANCHAR('c'); PUGI__SCANCHAR('o');
+ PUGI__SCANCHAR('d'); PUGI__SCANCHAR('i'); PUGI__SCANCHAR('n'); PUGI__SCANCHAR('g');
+
+ // S? = S?
+ PUGI__SCANCHARTYPE(ct_space);
+ PUGI__SCANCHAR('=');
+ PUGI__SCANCHARTYPE(ct_space);
+
+ // the only two valid delimiters are ' and "
+ uint8_t delimiter = (offset < size && data[offset] == '"') ? '"' : '\'';
+
+ PUGI__SCANCHAR(delimiter);
+
+ size_t start = offset;
+
+ out_encoding = data + offset;
+
+ PUGI__SCANCHARTYPE(ct_symbol);
+
+ out_length = offset - start;
+
+ PUGI__SCANCHAR(delimiter);
+
+ return true;
+ }
+ }
+
+ return false;
+
+ #undef PUGI__SCANCHAR
+ #undef PUGI__SCANCHARTYPE
+ }
+
+ PUGI__FN xml_encoding guess_buffer_encoding(const uint8_t* data, size_t size)
{
+ // skip encoding autodetection if input buffer is too small
+ if (size < 4) return encoding_utf8;
+
+ uint8_t d0 = data[0], d1 = data[1], d2 = data[2], d3 = data[3];
+
// look for BOM in first few bytes
if (d0 == 0 && d1 == 0 && d2 == 0xfe && d3 == 0xff) return encoding_utf32_be;
if (d0 == 0xff && d1 == 0xfe && d2 == 0 && d3 == 0) return encoding_utf32_le;
@@ -1359,13 +1966,32 @@ PUGI__NS_BEGIN
if (d0 == 0x3c && d1 == 0 && d2 == 0 && d3 == 0) return encoding_utf32_le;
if (d0 == 0 && d1 == 0x3c && d2 == 0 && d3 == 0x3f) return encoding_utf16_be;
if (d0 == 0x3c && d1 == 0 && d2 == 0x3f && d3 == 0) return encoding_utf16_le;
- if (d0 == 0x3c && d1 == 0x3f && d2 == 0x78 && d3 == 0x6d) return encoding_utf8;
// look for utf16 < followed by node name (this may fail, but is better than utf8 since it's zero terminated so early)
if (d0 == 0 && d1 == 0x3c) return encoding_utf16_be;
if (d0 == 0x3c && d1 == 0) return encoding_utf16_le;
- // no known BOM detected, assume utf8
+ // no known BOM detected; parse declaration
+ const uint8_t* enc = 0;
+ size_t enc_length = 0;
+
+ if (d0 == 0x3c && d1 == 0x3f && d2 == 0x78 && d3 == 0x6d && parse_declaration_encoding(data, size, enc, enc_length))
+ {
+ // iso-8859-1 (case-insensitive)
+ if (enc_length == 10
+ && (enc[0] | ' ') == 'i' && (enc[1] | ' ') == 's' && (enc[2] | ' ') == 'o'
+ && enc[3] == '-' && enc[4] == '8' && enc[5] == '8' && enc[6] == '5' && enc[7] == '9'
+ && enc[8] == '-' && enc[9] == '1')
+ return encoding_latin1;
+
+ // latin1 (case-insensitive)
+ if (enc_length == 6
+ && (enc[0] | ' ') == 'l' && (enc[1] | ' ') == 'a' && (enc[2] | ' ') == 't'
+ && (enc[3] | ' ') == 'i' && (enc[4] | ' ') == 'n'
+ && enc[5] == '1')
+ return encoding_latin1;
+ }
+
return encoding_utf8;
}
@@ -1383,15 +2009,10 @@ PUGI__NS_BEGIN
// only do autodetection if no explicit encoding is requested
if (encoding != encoding_auto) return encoding;
- // skip encoding autodetection if input buffer is too small
- if (size < 4) return encoding_utf8;
-
// try to guess encoding (based on XML specification, Appendix F.1)
const uint8_t* data = static_cast(contents);
- PUGI__DMC_VOLATILE uint8_t d0 = data[0], d1 = data[1], d2 = data[2], d3 = data[3];
-
- return guess_buffer_encoding(d0, d1, d2, d3);
+ return guess_buffer_encoding(data, size);
}
PUGI__FN bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
@@ -1458,38 +2079,13 @@ PUGI__NS_BEGIN
return true;
}
- PUGI__FN bool convert_buffer_utf8(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size)
- {
- const uint8_t* data = static_cast(contents);
- size_t data_length = size;
-
- // first pass: get length in wchar_t units
- size_t length = utf_decoder::decode_utf8_block(data, data_length, 0);
-
- // allocate buffer of suitable length
- char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t)));
- if (!buffer) return false;
-
- // second pass: convert utf8 input to wchar_t
- wchar_writer::value_type obegin = reinterpret_cast(buffer);
- wchar_writer::value_type oend = utf_decoder::decode_utf8_block(data, data_length, obegin);
-
- assert(oend == obegin + length);
- *oend = 0;
-
- out_buffer = buffer;
- out_length = length + 1;
-
- return true;
- }
-
- template PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+ template PUGI__FN bool convert_buffer_generic(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, D)
{
- const uint16_t* data = static_cast(contents);
- size_t data_length = size / sizeof(uint16_t);
+ const typename D::type* data = static_cast(contents);
+ size_t data_length = size / sizeof(typename D::type);
// first pass: get length in wchar_t units
- size_t length = utf_decoder::decode_utf16_block(data, data_length, 0);
+ size_t length = D::process(data, data_length, 0, wchar_counter());
// allocate buffer of suitable length
char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t)));
@@ -1497,57 +2093,7 @@ PUGI__NS_BEGIN
// second pass: convert utf16 input to wchar_t
wchar_writer::value_type obegin = reinterpret_cast(buffer);
- wchar_writer::value_type oend = utf_decoder::decode_utf16_block(data, data_length, obegin);
-
- assert(oend == obegin + length);
- *oend = 0;
-
- out_buffer = buffer;
- out_length = length + 1;
-
- return true;
- }
-
- template PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
- {
- const uint32_t* data = static_cast(contents);
- size_t data_length = size / sizeof(uint32_t);
-
- // first pass: get length in wchar_t units
- size_t length = utf_decoder::decode_utf32_block(data, data_length, 0);
-
- // allocate buffer of suitable length
- char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t)));
- if (!buffer) return false;
-
- // second pass: convert utf32 input to wchar_t
- wchar_writer::value_type obegin = reinterpret_cast(buffer);
- wchar_writer::value_type oend = utf_decoder::decode_utf32_block(data, data_length, obegin);
-
- assert(oend == obegin + length);
- *oend = 0;
-
- out_buffer = buffer;
- out_length = length + 1;
-
- return true;
- }
-
- PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size)
- {
- const uint8_t* data = static_cast(contents);
- size_t data_length = size;
-
- // get length in wchar_t units
- size_t length = data_length;
-
- // allocate buffer of suitable length
- char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t)));
- if (!buffer) return false;
-
- // convert latin1 input to wchar_t
- wchar_writer::value_type obegin = reinterpret_cast(buffer);
- wchar_writer::value_type oend = utf_decoder::decode_latin1_block(data, data_length, obegin);
+ wchar_writer::value_type oend = D::process(data, data_length, obegin, wchar_writer());
assert(oend == obegin + length);
*oend = 0;
@@ -1564,13 +2110,16 @@ PUGI__NS_BEGIN
xml_encoding wchar_encoding = get_wchar_encoding();
// fast path: no conversion required
- if (encoding == wchar_encoding) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+ if (encoding == wchar_encoding)
+ return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
// only endian-swapping is required
- if (need_endian_swap_utf(encoding, wchar_encoding)) return convert_buffer_endian_swap(out_buffer, out_length, contents, size, is_mutable);
+ if (need_endian_swap_utf(encoding, wchar_encoding))
+ return convert_buffer_endian_swap(out_buffer, out_length, contents, size, is_mutable);
// source encoding is utf8
- if (encoding == encoding_utf8) return convert_buffer_utf8(out_buffer, out_length, contents, size);
+ if (encoding == encoding_utf8)
+ return convert_buffer_generic(out_buffer, out_length, contents, size, utf8_decoder());
// source encoding is utf16
if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
@@ -1578,8 +2127,8 @@ PUGI__NS_BEGIN
xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
return (native_encoding == encoding) ?
- convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
- convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
+ convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder()) :
+ convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder());
}
// source encoding is utf32
@@ -1588,24 +2137,25 @@ PUGI__NS_BEGIN
xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
return (native_encoding == encoding) ?
- convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
- convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
+ convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder()) :
+ convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder());
}
// source encoding is latin1
- if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size);
+ if (encoding == encoding_latin1)
+ return convert_buffer_generic(out_buffer, out_length, contents, size, latin1_decoder());
- assert(!"Invalid encoding");
+ assert(false && "Invalid encoding");
return false;
}
#else
- template PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+ template PUGI__FN bool convert_buffer_generic(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, D)
{
- const uint16_t* data = static_cast(contents);
- size_t data_length = size / sizeof(uint16_t);
+ const typename D::type* data = static_cast(contents);
+ size_t data_length = size / sizeof(typename D::type);
// first pass: get length in utf8 units
- size_t length = utf_decoder::decode_utf16_block(data, data_length, 0);
+ size_t length = D::process(data, data_length, 0, utf8_counter());
// allocate buffer of suitable length
char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t)));
@@ -1613,32 +2163,7 @@ PUGI__NS_BEGIN
// second pass: convert utf16 input to utf8
uint8_t* obegin = reinterpret_cast(buffer);
- uint8_t* oend = utf_decoder::decode_utf16_block(data, data_length, obegin);
-
- assert(oend == obegin + length);
- *oend = 0;
-
- out_buffer = buffer;
- out_length = length + 1;
-
- return true;
- }
-
- template PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
- {
- const uint32_t* data = static_cast(contents);
- size_t data_length = size / sizeof(uint32_t);
-
- // first pass: get length in utf8 units
- size_t length = utf_decoder::decode_utf32_block(data, data_length, 0);
-
- // allocate buffer of suitable length
- char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t)));
- if (!buffer) return false;
-
- // second pass: convert utf32 input to utf8
- uint8_t* obegin = reinterpret_cast(buffer);
- uint8_t* oend = utf_decoder::decode_utf32_block(data, data_length, obegin);
+ uint8_t* oend = D::process(data, data_length, obegin, utf8_writer());
assert(oend == obegin + length);
*oend = 0;
@@ -1674,7 +2199,7 @@ PUGI__NS_BEGIN
if (postfix_length == 0) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
// first pass: get length in utf8 units
- size_t length = prefix_length + utf_decoder::decode_latin1_block(postfix, postfix_length, 0);
+ size_t length = prefix_length + latin1_decoder::process(postfix, postfix_length, 0, utf8_counter());
// allocate buffer of suitable length
char_t* buffer = static_cast(xml_memory::allocate((length + 1) * sizeof(char_t)));
@@ -1684,7 +2209,7 @@ PUGI__NS_BEGIN
memcpy(buffer, data, prefix_length);
uint8_t* obegin = reinterpret_cast(buffer);
- uint8_t* oend = utf_decoder::decode_latin1_block(postfix, postfix_length, obegin + prefix_length);
+ uint8_t* oend = latin1_decoder::process(postfix, postfix_length, obegin + prefix_length, utf8_writer());
assert(oend == obegin + length);
*oend = 0;
@@ -1698,7 +2223,8 @@ PUGI__NS_BEGIN
PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
{
// fast path: no conversion required
- if (encoding == encoding_utf8) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+ if (encoding == encoding_utf8)
+ return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
// source encoding is utf16
if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
@@ -1706,8 +2232,8 @@ PUGI__NS_BEGIN
xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
return (native_encoding == encoding) ?
- convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
- convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
+ convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder()) :
+ convert_buffer_generic(out_buffer, out_length, contents, size, utf16_decoder());
}
// source encoding is utf32
@@ -1716,14 +2242,15 @@ PUGI__NS_BEGIN
xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
return (native_encoding == encoding) ?
- convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
- convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
+ convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder()) :
+ convert_buffer_generic(out_buffer, out_length, contents, size, utf32_decoder());
}
// source encoding is latin1
- if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable);
+ if (encoding == encoding_latin1)
+ return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable);
- assert(!"Invalid encoding");
+ assert(false && "Invalid encoding");
return false;
}
#endif
@@ -1731,22 +2258,20 @@ PUGI__NS_BEGIN
PUGI__FN size_t as_utf8_begin(const wchar_t* str, size_t length)
{
// get length in utf8 characters
- return utf_decoder::decode_wchar_block(str, length, 0);
+ return wchar_decoder::process(str, length, 0, utf8_counter());
}
PUGI__FN void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length)
{
// convert to utf8
uint8_t* begin = reinterpret_cast(buffer);
- uint8_t* end = utf_decoder::decode_wchar_block(str, length, begin);
-
+ uint8_t* end = wchar_decoder::process(str, length, begin, utf8_writer());
+
assert(begin + size == end);
(void)!end;
-
- // zero-terminate
- buffer[size] = 0;
+ (void)!size;
}
-
+
#ifndef PUGIXML_NO_STL
PUGI__FN std::string as_utf8_impl(const wchar_t* str, size_t length)
{
@@ -1768,7 +2293,7 @@ PUGI__NS_BEGIN
const uint8_t* data = reinterpret_cast(str);
// first pass: get length in wchar_t units
- size_t length = utf_decoder::decode_utf8_block(data, size, 0);
+ size_t length = utf8_decoder::process(data, size, 0, wchar_counter());
// allocate resulting string
std::basic_string result;
@@ -1778,7 +2303,7 @@ PUGI__NS_BEGIN
if (length > 0)
{
wchar_writer::value_type begin = reinterpret_cast(&result[0]);
- wchar_writer::value_type end = utf_decoder::decode_utf8_block(data, size, begin);
+ wchar_writer::value_type end = utf8_decoder::process(data, size, begin, wchar_writer());
assert(begin + length == end);
(void)!end;
@@ -1788,7 +2313,8 @@ PUGI__NS_BEGIN
}
#endif
- inline bool strcpy_insitu_allow(size_t length, uintptr_t header, uintptr_t header_mask, char_t* target)
+ template
+ inline bool strcpy_insitu_allow(size_t length, const Header& header, uintptr_t header_mask, char_t* target)
{
// never reuse shared memory
if (header & xml_memory_page_contents_shared_mask) return false;
@@ -1804,19 +2330,16 @@ PUGI__NS_BEGIN
return target_length >= length && (target_length < reuse_threshold || target_length - length < target_length / 2);
}
- PUGI__FN bool strcpy_insitu(char_t*& dest, uintptr_t& header, uintptr_t header_mask, const char_t* source)
+ template
+ PUGI__FN bool strcpy_insitu(String& dest, Header& header, uintptr_t header_mask, const char_t* source, size_t source_length)
{
- assert(header);
-
- size_t source_length = strlength(source);
-
if (source_length == 0)
{
// empty string and null pointer are equivalent, so just deallocate old memory
- xml_allocator* alloc = reinterpret_cast(header & xml_memory_page_pointer_mask)->allocator;
+ xml_allocator* alloc = PUGI__GETPAGE_IMPL(header)->allocator;
if (header & header_mask) alloc->deallocate_string(dest);
-
+
// mark the string as not allocated
dest = 0;
header &= ~header_mask;
@@ -1826,24 +2349,28 @@ PUGI__NS_BEGIN
else if (dest && strcpy_insitu_allow(source_length, header, header_mask, dest))
{
// we can reuse old buffer, so just copy the new data (including zero terminator)
- memcpy(dest, source, (source_length + 1) * sizeof(char_t));
-
+ memcpy(dest, source, source_length * sizeof(char_t));
+ dest[source_length] = 0;
+
return true;
}
else
{
- xml_allocator* alloc = reinterpret_cast(header & xml_memory_page_pointer_mask)->allocator;
+ xml_allocator* alloc = PUGI__GETPAGE_IMPL(header)->allocator;
+
+ if (!alloc->reserve()) return false;
// allocate new buffer
char_t* buf = alloc->allocate_string(source_length + 1);
if (!buf) return false;
// copy the string (including zero terminator)
- memcpy(buf, source, (source_length + 1) * sizeof(char_t));
+ memcpy(buf, source, source_length * sizeof(char_t));
+ buf[source_length] = 0;
// deallocate old buffer (*after* the above to protect against overlapping memory and/or allocation failures)
if (header & header_mask) alloc->deallocate_string(dest);
-
+
// the string is now allocated, so set the flag
dest = buf;
header |= header_mask;
@@ -1856,11 +2383,11 @@ PUGI__NS_BEGIN
{
char_t* end;
size_t size;
-
+
gap(): end(0), size(0)
{
}
-
+
// Push new gap, move s count bytes further (skipping the gap).
// Collapse previous gap.
void push(char_t*& s, size_t count)
@@ -1871,14 +2398,14 @@ PUGI__NS_BEGIN
assert(s >= end);
memmove(end - size, end, reinterpret_cast(s) - reinterpret_cast(end));
}
-
+
s += count; // end of current gap
-
+
// "merge" two gaps
end = s;
size += count;
}
-
+
// Collapse all gaps, return past-the-end pointer
char_t* flush(char_t* s)
{
@@ -1893,7 +2420,7 @@ PUGI__NS_BEGIN
else return s;
}
};
-
+
PUGI__FN char_t* strconv_escape(char_t* s, gap& g)
{
char_t* stre = s + 1;
@@ -1925,7 +2452,7 @@ PUGI__NS_BEGIN
ch = *++stre;
}
-
+
++stre;
}
else // ... (dec code)
@@ -1945,7 +2472,7 @@ PUGI__NS_BEGIN
ch = *++stre;
}
-
+
++stre;
}
@@ -1954,7 +2481,7 @@ PUGI__NS_BEGIN
#else
s = reinterpret_cast(utf8_writer::any(reinterpret_cast(s), ucsc));
#endif
-
+
g.push(s, stre - s);
return stre;
}
@@ -1969,7 +2496,7 @@ PUGI__NS_BEGIN
{
*s++ = '&';
++stre;
-
+
g.push(s, stre - s);
return stre;
}
@@ -1994,7 +2521,7 @@ PUGI__NS_BEGIN
{
*s++ = '>';
++stre;
-
+
g.push(s, stre - s);
return stre;
}
@@ -2007,7 +2534,7 @@ PUGI__NS_BEGIN
{
*s++ = '<';
++stre;
-
+
g.push(s, stre - s);
return stre;
}
@@ -2020,7 +2547,7 @@ PUGI__NS_BEGIN
{
*s++ = '"';
++stre;
-
+
g.push(s, stre - s);
return stre;
}
@@ -2030,7 +2557,7 @@ PUGI__NS_BEGIN
default:
break;
}
-
+
return stre;
}
@@ -2038,7 +2565,7 @@ PUGI__NS_BEGIN
#define PUGI__ENDSWITH(c, e) ((c) == (e) || ((c) == 0 && endch == (e)))
#define PUGI__SKIPWS() { while (PUGI__IS_CHARTYPE(*s, ct_space)) ++s; }
#define PUGI__OPTSET(OPT) ( optmsk & (OPT) )
- #define PUGI__PUSHNODE(TYPE) { cursor = append_new_node(cursor, alloc, TYPE); if (!cursor) PUGI__THROW_ERROR(status_out_of_memory, s); }
+ #define PUGI__PUSHNODE(TYPE) { cursor = append_new_node(cursor, *alloc, TYPE); if (!cursor) PUGI__THROW_ERROR(status_out_of_memory, s); }
#define PUGI__POPNODE() { cursor = cursor->parent; }
#define PUGI__SCANFOR(X) { while (*s != 0 && !(X)) ++s; }
#define PUGI__SCANWHILE(X) { while (X) ++s; }
@@ -2050,21 +2577,21 @@ PUGI__NS_BEGIN
PUGI__FN char_t* strconv_comment(char_t* s, char_t endch)
{
gap g;
-
+
while (true)
{
PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_comment));
-
+
if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
{
*s++ = '\n'; // replace first one with 0x0a
-
+
if (*s == '\n') g.push(s, 1);
}
else if (s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>')) // comment ends here
{
*g.flush(s) = 0;
-
+
return s + (s[2] == '>' ? 3 : 2);
}
else if (*s == 0)
@@ -2078,21 +2605,21 @@ PUGI__NS_BEGIN
PUGI__FN char_t* strconv_cdata(char_t* s, char_t endch)
{
gap g;
-
+
while (true)
{
PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_cdata));
-
+
if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
{
*s++ = '\n'; // replace first one with 0x0a
-
+
if (*s == '\n') g.push(s, 1);
}
else if (s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>')) // CDATA ends here
{
*g.flush(s) = 0;
-
+
return s + 1;
}
else if (*s == 0)
@@ -2102,9 +2629,9 @@ PUGI__NS_BEGIN
else ++s;
}
}
-
+
typedef char_t* (*strconv_pcdata_t)(char_t*);
-
+
template struct strconv_pcdata_impl
{
static char_t* parse(char_t* s)
@@ -2126,13 +2653,13 @@ PUGI__NS_BEGIN
--end;
*end = 0;
-
+
return s + 1;
}
else if (opt_eol::value && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
{
*s++ = '\n'; // replace first one with 0x0a
-
+
if (*s == '\n') g.push(s, 1);
}
else if (opt_escape::value && *s == '&')
@@ -2155,7 +2682,7 @@ PUGI__NS_BEGIN
}
}
};
-
+
PUGI__FN strconv_pcdata_t get_strconv_pcdata(unsigned int optmask)
{
PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800);
@@ -2175,7 +2702,7 @@ PUGI__NS_BEGIN
}
typedef char_t* (*strconv_attribute_t)(char_t*, char_t);
-
+
template struct strconv_attribute_impl
{
static char_t* parse_wnorm(char_t* s, char_t end_quote)
@@ -2186,35 +2713,35 @@ PUGI__NS_BEGIN
if (PUGI__IS_CHARTYPE(*s, ct_space))
{
char_t* str = s;
-
+
do ++str;
while (PUGI__IS_CHARTYPE(*str, ct_space));
-
+
g.push(s, str - s);
}
while (true)
{
PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws | ct_space));
-
+
if (*s == end_quote)
{
char_t* str = g.flush(s);
-
+
do *str-- = 0;
while (PUGI__IS_CHARTYPE(*str, ct_space));
-
+
return s + 1;
}
else if (PUGI__IS_CHARTYPE(*s, ct_space))
{
*s++ = ' ';
-
+
if (PUGI__IS_CHARTYPE(*s, ct_space))
{
char_t* str = s + 1;
while (PUGI__IS_CHARTYPE(*str, ct_space)) ++str;
-
+
g.push(s, str - s);
}
}
@@ -2237,11 +2764,11 @@ PUGI__NS_BEGIN
while (true)
{
PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws));
-
+
if (*s == end_quote)
{
*g.flush(s) = 0;
-
+
return s + 1;
}
else if (PUGI__IS_CHARTYPE(*s, ct_space))
@@ -2249,7 +2776,7 @@ PUGI__NS_BEGIN
if (*s == '\r')
{
*s++ = ' ';
-
+
if (*s == '\n') g.push(s, 1);
}
else *s++ = ' ';
@@ -2273,17 +2800,17 @@ PUGI__NS_BEGIN
while (true)
{
PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr));
-
+
if (*s == end_quote)
{
*g.flush(s) = 0;
-
+
return s + 1;
}
else if (*s == '\r')
{
*s++ = '\n';
-
+
if (*s == '\n') g.push(s, 1);
}
else if (opt_escape::value && *s == '&')
@@ -2305,11 +2832,11 @@ PUGI__NS_BEGIN
while (true)
{
PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr));
-
+
if (*s == end_quote)
{
*g.flush(s) = 0;
-
+
return s + 1;
}
else if (opt_escape::value && *s == '&')
@@ -2328,7 +2855,7 @@ PUGI__NS_BEGIN
PUGI__FN strconv_attribute_t get_strconv_attribute(unsigned int optmask)
{
PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80);
-
+
switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes)
{
case 0: return strconv_attribute_impl::parse_simple;
@@ -2362,11 +2889,11 @@ PUGI__NS_BEGIN
struct xml_parser
{
- xml_allocator alloc;
+ xml_allocator* alloc;
char_t* error_offset;
xml_parse_status error_status;
-
- xml_parser(const xml_allocator& alloc_): alloc(alloc_), error_offset(0), error_status(status_ok)
+
+ xml_parser(xml_allocator* alloc_): alloc(alloc_), error_offset(0), error_status(status_ok)
{
}
@@ -2403,7 +2930,7 @@ PUGI__NS_BEGIN
PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype
if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
- s += 4;
+ s += 3;
}
else PUGI__THROW_ERROR(status_bad_doctype, s);
@@ -2412,23 +2939,28 @@ PUGI__NS_BEGIN
char_t* parse_doctype_ignore(char_t* s)
{
+ size_t depth = 0;
+
assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
- s++;
+ s += 3;
while (*s)
{
if (s[0] == '<' && s[1] == '!' && s[2] == '[')
{
// nested ignore section
- s = parse_doctype_ignore(s);
- if (!s) return s;
+ s += 3;
+ depth++;
}
else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
{
// ignore section end
s += 3;
- return s;
+ if (depth == 0)
+ return s;
+
+ depth--;
}
else s++;
}
@@ -2436,10 +2968,12 @@ PUGI__NS_BEGIN
PUGI__THROW_ERROR(status_bad_doctype, s);
}
- char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel)
+ char_t* parse_doctype_group(char_t* s, char_t endch)
{
+ size_t depth = 0;
+
assert((s[0] == '<' || s[0] == 0) && s[1] == '!');
- s++;
+ s += 2;
while (*s)
{
@@ -2454,12 +2988,8 @@ PUGI__NS_BEGIN
else
{
// some control group
- s = parse_doctype_group(s, endch, false);
- if (!s) return s;
-
- // skip >
- assert(*s == '>');
- s++;
+ s += 2;
+ depth++;
}
}
else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
@@ -2470,12 +3000,16 @@ PUGI__NS_BEGIN
}
else if (*s == '>')
{
- return s;
+ if (depth == 0)
+ return s;
+
+ depth--;
+ s++;
}
else s++;
}
- if (!toplevel || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);
+ if (depth != 0 || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);
return s;
}
@@ -2567,7 +3101,7 @@ PUGI__NS_BEGIN
char_t* mark = s + 9;
- s = parse_doctype_group(s, endch, true);
+ s = parse_doctype_group(s, endch);
if (!s) return s;
assert((*s == 0 && endch == '>') || *s == '>');
@@ -2658,6 +3192,7 @@ PUGI__NS_BEGIN
{
// store value and step over >
cursor->value = value;
+
PUGI__POPNODE();
PUGI__ENDSEG();
@@ -2686,7 +3221,7 @@ PUGI__NS_BEGIN
{
strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk);
strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk);
-
+
char_t ch = 0;
xml_node_struct* cursor = root;
char_t* mark = s;
@@ -2717,10 +3252,10 @@ PUGI__NS_BEGIN
while (true)
{
PUGI__SKIPWS(); // Eat any whitespace.
-
+
if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // <... #...
{
- xml_attribute_struct* a = append_new_attribute(cursor, alloc); // Make space for this attribute.
+ xml_attribute_struct* a = append_new_attribute(cursor, *alloc); // Make space for this attribute.
if (!a) PUGI__THROW_ERROR(status_out_of_memory, s);
a->name = s; // Save the offset.
@@ -2735,7 +3270,7 @@ PUGI__NS_BEGIN
ch = *s;
++s;
}
-
+
if (ch == '=') // '<... #=...'
{
PUGI__SKIPWS(); // Eat any whitespace.
@@ -2747,7 +3282,7 @@ PUGI__NS_BEGIN
a->value = s; // Save the offset.
s = strconv_attribute(s, ch);
-
+
if (!s) PUGI__THROW_ERROR(status_bad_attribute, a->value);
// After this line the loop continues from the start;
@@ -2762,7 +3297,7 @@ PUGI__NS_BEGIN
else if (*s == '/')
{
++s;
-
+
if (*s == '>')
{
PUGI__POPNODE();
@@ -2803,7 +3338,7 @@ PUGI__NS_BEGIN
{
// we stepped over null terminator, backtrack & handle closing tag
--s;
-
+
if (endch != '>') PUGI__THROW_ERROR(status_bad_start_element, s);
}
else PUGI__THROW_ERROR(status_bad_start_element, s);
@@ -2812,20 +3347,22 @@ PUGI__NS_BEGIN
{
++s;
+ mark = s;
+
char_t* name = cursor->name;
- if (!name) PUGI__THROW_ERROR(status_end_element_mismatch, s);
-
+ if (!name) PUGI__THROW_ERROR(status_end_element_mismatch, mark);
+
while (PUGI__IS_CHARTYPE(*s, ct_symbol))
{
- if (*s++ != *name++) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+ if (*s++ != *name++) PUGI__THROW_ERROR(status_end_element_mismatch, mark);
}
if (*name)
{
if (*s == 0 && name[0] == endch && name[1] == 0) PUGI__THROW_ERROR(status_bad_end_element, s);
- else PUGI__THROW_ERROR(status_end_element_mismatch, s);
+ else PUGI__THROW_ERROR(status_end_element_mismatch, mark);
}
-
+
PUGI__POPNODE(); // Pop.
PUGI__SKIPWS();
@@ -2879,23 +3416,31 @@ PUGI__NS_BEGIN
if (!PUGI__OPTSET(parse_trim_pcdata))
s = mark;
-
+
if (cursor->parent || PUGI__OPTSET(parse_fragment))
{
- PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
- cursor->value = s; // Save the offset.
+ if (PUGI__OPTSET(parse_embed_pcdata) && cursor->parent && !cursor->first_child && !cursor->value)
+ {
+ cursor->value = s; // Save the offset.
+ }
+ else
+ {
+ PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
+
+ cursor->value = s; // Save the offset.
+
+ PUGI__POPNODE(); // Pop since this is a standalone.
+ }
s = strconv_pcdata(s);
-
- PUGI__POPNODE(); // Pop since this is a standalone.
-
+
if (!*s) break;
}
else
{
PUGI__SCANFOR(*s == '<'); // '...<'
if (!*s) break;
-
+
++s;
}
@@ -2937,32 +3482,26 @@ PUGI__NS_BEGIN
static xml_parse_result parse(char_t* buffer, size_t length, xml_document_struct* xmldoc, xml_node_struct* root, unsigned int optmsk)
{
- // allocator object is a part of document object
- xml_allocator& alloc_ = *static_cast(xmldoc);
-
// early-out for empty documents
if (length == 0)
return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element);
// get last child of the root before parsing
- xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c : 0;
-
+ xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c + 0 : 0;
+
// create parser on stack
- xml_parser parser(alloc_);
+ xml_parser parser(static_cast(xmldoc));
// save last character and make buffer zero-terminated (speeds up parsing)
char_t endch = buffer[length - 1];
buffer[length - 1] = 0;
-
+
// skip BOM to make sure it does not end up as part of parse output
char_t* buffer_data = parse_skip_bom(buffer);
// perform actual parsing
parser.parse_tree(buffer_data, root, optmsk, endch);
- // update allocator state
- alloc_ = parser.alloc;
-
xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0);
assert(result.offset >= 0 && static_cast(result.offset) <= length);
@@ -2973,7 +3512,7 @@ PUGI__NS_BEGIN
return make_parse_result(status_unrecognized_tag, length - 1);
// check if there are any element nodes parsed
- xml_node_struct* first_root_child_parsed = last_root_child ? last_root_child->next_sibling : root->first_child;
+ xml_node_struct* first_root_child_parsed = last_root_child ? last_root_child->next_sibling + 0 : root->first_child+ 0;
if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed))
return make_parse_result(status_no_document_element, length - 1);
@@ -3017,12 +3556,36 @@ PUGI__NS_BEGIN
return encoding_utf8;
}
+ template PUGI__FN size_t convert_buffer_output_generic(typename T::value_type dest, const char_t* data, size_t length, D, T)
+ {
+ PUGI__STATIC_ASSERT(sizeof(char_t) == sizeof(typename D::type));
+
+ typename T::value_type end = D::process(reinterpret_cast(data), length, dest, T());
+
+ return static_cast(end - dest) * sizeof(*dest);
+ }
+
+ template PUGI__FN size_t convert_buffer_output_generic(typename T::value_type dest, const char_t* data, size_t length, D, T, bool opt_swap)
+ {
+ PUGI__STATIC_ASSERT(sizeof(char_t) == sizeof(typename D::type));
+
+ typename T::value_type end = D::process(reinterpret_cast(data), length, dest, T());
+
+ if (opt_swap)
+ {
+ for (typename T::value_type i = dest; i != end; ++i)
+ *i = endian_swap(*i);
+ }
+
+ return static_cast(end - dest) * sizeof(*dest);
+ }
+
#ifdef PUGIXML_WCHAR_MODE
PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
{
if (length < 1) return 0;
- // discard last character if it's the lead of a surrogate pair
+ // discard last character if it's the lead of a surrogate pair
return (sizeof(wchar_t) == 2 && static_cast(static_cast(data[length - 1]) - 0xD800) < 0x400) ? length - 1 : length;
}
@@ -3035,58 +3598,32 @@ PUGI__NS_BEGIN
return length * sizeof(char_t);
}
-
+
// convert to utf8
if (encoding == encoding_utf8)
- {
- uint8_t* dest = r_u8;
- uint8_t* end = utf_decoder::decode_wchar_block(data, length, dest);
-
- return static_cast(end - dest);
- }
+ return convert_buffer_output_generic(r_u8, data, length, wchar_decoder(), utf8_writer());
// convert to utf16
if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
{
- uint16_t* dest = r_u16;
-
- // convert to native utf16
- uint16_t* end = utf_decoder::decode_wchar_block(data, length, dest);
-
- // swap if necessary
xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
- if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast(end - dest));
-
- return static_cast(end - dest) * sizeof(uint16_t);
+ return convert_buffer_output_generic(r_u16, data, length, wchar_decoder(), utf16_writer(), native_encoding != encoding);
}
// convert to utf32
if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
{
- uint32_t* dest = r_u32;
-
- // convert to native utf32
- uint32_t* end = utf_decoder::decode_wchar_block(data, length, dest);
-
- // swap if necessary
xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
- if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast(end - dest));
-
- return static_cast(end - dest) * sizeof(uint32_t);
+ return convert_buffer_output_generic(r_u32, data, length, wchar_decoder(), utf32_writer(), native_encoding != encoding);
}
// convert to latin1
if (encoding == encoding_latin1)
- {
- uint8_t* dest = r_u8;
- uint8_t* end = utf_decoder::decode_wchar_block(data, length, dest);
+ return convert_buffer_output_generic(r_u8, data, length, wchar_decoder(), latin1_writer());
- return static_cast(end - dest);
- }
-
- assert(!"Invalid encoding");
+ assert(false && "Invalid encoding");
return 0;
}
#else
@@ -3110,43 +3647,22 @@ PUGI__NS_BEGIN
{
if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
{
- uint16_t* dest = r_u16;
-
- // convert to native utf16
- uint16_t* end = utf_decoder::decode_utf8_block(reinterpret_cast(data), length, dest);
-
- // swap if necessary
xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
- if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast(end - dest));
-
- return static_cast(end - dest) * sizeof(uint16_t);
+ return convert_buffer_output_generic(r_u16, data, length, utf8_decoder(), utf16_writer(), native_encoding != encoding);
}
if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
{
- uint32_t* dest = r_u32;
-
- // convert to native utf32
- uint32_t* end = utf_decoder::decode_utf8_block(reinterpret_cast(data), length, dest);
-
- // swap if necessary
xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
- if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast(end - dest));
-
- return static_cast(end - dest) * sizeof(uint32_t);
+ return convert_buffer_output_generic(r_u32, data, length, utf8_decoder(), utf32_writer(), native_encoding != encoding);
}
if (encoding == encoding_latin1)
- {
- uint8_t* dest = r_u8;
- uint8_t* end = utf_decoder::decode_utf8_block(reinterpret_cast(data), length, dest);
-
- return static_cast(end - dest);
- }
+ return convert_buffer_output_generic(r_u8, data, length, utf8_decoder(), latin1_writer());
- assert(!"Invalid encoding");
+ assert(false && "Invalid encoding");
return 0;
}
#endif
@@ -3162,11 +3678,6 @@ PUGI__NS_BEGIN
PUGI__STATIC_ASSERT(bufcapacity >= 8);
}
- ~xml_buffered_writer()
- {
- flush();
- }
-
size_t flush()
{
flush(buffer, bufsize);
@@ -3375,10 +3886,10 @@ PUGI__NS_BEGIN
while (*s)
{
const char_t* prev = s;
-
+
// While *s is a usual symbol
PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPEX(ss, type));
-
+
writer.write_buffer(prev, static_cast(s - prev));
switch (*s)
@@ -3506,14 +4017,45 @@ PUGI__NS_BEGIN
writer.write('-', '-', '>');
}
- PUGI__FN void node_output_attributes(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
+ PUGI__FN void node_output_pi_value(xml_buffered_writer& writer, const char_t* s)
+ {
+ while (*s)
+ {
+ const char_t* prev = s;
+
+ // look for ?> sequence - we can't output it since ?> terminates PI
+ while (*s && !(s[0] == '?' && s[1] == '>')) ++s;
+
+ writer.write_buffer(prev, static_cast(s - prev));
+
+ if (*s)
+ {
+ assert(s[0] == '?' && s[1] == '>');
+
+ writer.write('?', ' ', '>');
+ s += 2;
+ }
+ }
+ }
+
+ PUGI__FN void node_output_attributes(xml_buffered_writer& writer, xml_node_struct* node, const char_t* indent, size_t indent_length, unsigned int flags, unsigned int depth)
{
const char_t* default_name = PUGIXML_TEXT(":anonymous");
for (xml_attribute_struct* a = node->first_attribute; a; a = a->next_attribute)
{
- writer.write(' ');
- writer.write_string(a->name ? a->name : default_name);
+ if ((flags & (format_indent_attributes | format_raw)) == format_indent_attributes)
+ {
+ writer.write('\n');
+
+ text_output_indent(writer, indent, indent_length, depth + 1);
+ }
+ else
+ {
+ writer.write(' ');
+ }
+
+ writer.write_string(a->name ? a->name + 0 : default_name);
writer.write('=', '"');
if (a->value)
@@ -3523,21 +4065,40 @@ PUGI__NS_BEGIN
}
}
- PUGI__FN bool node_output_start(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
+ PUGI__FN bool node_output_start(xml_buffered_writer& writer, xml_node_struct* node, const char_t* indent, size_t indent_length, unsigned int flags, unsigned int depth)
{
const char_t* default_name = PUGIXML_TEXT(":anonymous");
- const char_t* name = node->name ? node->name : default_name;
+ const char_t* name = node->name ? node->name + 0 : default_name;
writer.write('<');
writer.write_string(name);
if (node->first_attribute)
- node_output_attributes(writer, node, flags);
+ node_output_attributes(writer, node, indent, indent_length, flags, depth);
- if (flags & format_raw)
+ // element nodes can have value if parse_embed_pcdata was used
+ if (!node->value)
{
if (!node->first_child)
- writer.write(' ', '/', '>');
+ {
+ if (flags & format_no_empty_element_tags)
+ {
+ writer.write('>', '<', '/');
+ writer.write_string(name);
+ writer.write('>');
+
+ return false;
+ }
+ else
+ {
+ if ((flags & format_raw) == 0)
+ writer.write(' ');
+
+ writer.write('/', '>');
+
+ return false;
+ }
+ }
else
{
writer.write('>');
@@ -3547,48 +4108,33 @@ PUGI__NS_BEGIN
}
else
{
- xml_node_struct* first = node->first_child;
-
- if (!first)
- writer.write(' ', '/', '>', '\n');
- else if (!first->next_sibling && (PUGI__NODETYPE(first) == node_pcdata || PUGI__NODETYPE(first) == node_cdata))
- {
- writer.write('>');
-
- const char_t* value = first->value ? first->value : PUGIXML_TEXT("");
+ writer.write('>');
- if (PUGI__NODETYPE(first) == node_pcdata)
- text_output(writer, value, ctx_special_pcdata, flags);
- else
- text_output_cdata(writer, value);
+ text_output(writer, node->value, ctx_special_pcdata, flags);
+ if (!node->first_child)
+ {
writer.write('<', '/');
writer.write_string(name);
- writer.write('>', '\n');
+ writer.write('>');
+
+ return false;
}
else
{
- writer.write('>', '\n');
-
return true;
}
}
-
- return false;
}
- PUGI__FN void node_output_end(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
+ PUGI__FN void node_output_end(xml_buffered_writer& writer, xml_node_struct* node)
{
const char_t* default_name = PUGIXML_TEXT(":anonymous");
- const char_t* name = node->name ? node->name : default_name;
+ const char_t* name = node->name ? node->name + 0 : default_name;
writer.write('<', '/');
writer.write_string(name);
-
- if (flags & format_raw)
- writer.write('>');
- else
- writer.write('>', '\n');
+ writer.write('>');
}
PUGI__FN void node_output_simple(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
@@ -3598,40 +4144,35 @@ PUGI__NS_BEGIN
switch (PUGI__NODETYPE(node))
{
case node_pcdata:
- text_output(writer, node->value ? node->value : PUGIXML_TEXT(""), ctx_special_pcdata, flags);
- if ((flags & format_raw) == 0) writer.write('\n');
+ text_output(writer, node->value ? node->value + 0 : PUGIXML_TEXT(""), ctx_special_pcdata, flags);
break;
case node_cdata:
- text_output_cdata(writer, node->value ? node->value : PUGIXML_TEXT(""));
- if ((flags & format_raw) == 0) writer.write('\n');
+ text_output_cdata(writer, node->value ? node->value + 0 : PUGIXML_TEXT(""));
break;
case node_comment:
- node_output_comment(writer, node->value ? node->value : PUGIXML_TEXT(""));
- if ((flags & format_raw) == 0) writer.write('\n');
+ node_output_comment(writer, node->value ? node->value + 0 : PUGIXML_TEXT(""));
break;
case node_pi:
writer.write('<', '?');
- writer.write_string(node->name ? node->name : default_name);
+ writer.write_string(node->name ? node->name + 0 : default_name);
if (node->value)
{
writer.write(' ');
- writer.write_string(node->value);
+ node_output_pi_value(writer, node->value);
}
writer.write('?', '>');
- if ((flags & format_raw) == 0) writer.write('\n');
break;
case node_declaration:
writer.write('<', '?');
- writer.write_string(node->name ? node->name : default_name);
- node_output_attributes(writer, node, flags);
+ writer.write_string(node->name ? node->name + 0 : default_name);
+ node_output_attributes(writer, node, PUGIXML_TEXT(""), 0, flags | format_raw, 0);
writer.write('?', '>');
- if ((flags & format_raw) == 0) writer.write('\n');
break;
case node_doctype:
@@ -3645,17 +4186,23 @@ PUGI__NS_BEGIN
}
writer.write('>');
- if ((flags & format_raw) == 0) writer.write('\n');
break;
default:
- assert(!"Invalid node type");
+ assert(false && "Invalid node type");
}
}
+ enum indent_flags_t
+ {
+ indent_newline = 1,
+ indent_indent = 2
+ };
+
PUGI__FN void node_output(xml_buffered_writer& writer, xml_node_struct* root, const char_t* indent, unsigned int flags, unsigned int depth)
{
- size_t indent_length = ((flags & (format_indent | format_raw)) == format_indent) ? strlength(indent) : 0;
+ size_t indent_length = ((flags & (format_indent | format_indent_attributes)) && (flags & format_raw) == 0) ? strlength(indent) : 0;
+ unsigned int indent_flags = indent_indent;
xml_node_struct* node = root;
@@ -3664,30 +4211,52 @@ PUGI__NS_BEGIN
assert(node);
// begin writing current node
- if (indent_length)
- text_output_indent(writer, indent, indent_length, depth);
+ if (PUGI__NODETYPE(node) == node_pcdata || PUGI__NODETYPE(node) == node_cdata)
+ {
+ node_output_simple(writer, node, flags);
- if (PUGI__NODETYPE(node) == node_element)
+ indent_flags = 0;
+ }
+ else
{
- if (node_output_start(writer, node, flags))
+ if ((indent_flags & indent_newline) && (flags & format_raw) == 0)
+ writer.write('\n');
+
+ if ((indent_flags & indent_indent) && indent_length)
+ text_output_indent(writer, indent, indent_length, depth);
+
+ if (PUGI__NODETYPE(node) == node_element)
{
- node = node->first_child;
- depth++;
- continue;
+ indent_flags = indent_newline | indent_indent;
+
+ if (node_output_start(writer, node, indent, indent_length, flags, depth))
+ {
+ // element nodes can have value if parse_embed_pcdata was used
+ if (node->value)
+ indent_flags = 0;
+
+ node = node->first_child;
+ depth++;
+ continue;
+ }
}
- }
- else if (PUGI__NODETYPE(node) == node_document)
- {
- if (node->first_child)
+ else if (PUGI__NODETYPE(node) == node_document)
+ {
+ indent_flags = indent_indent;
+
+ if (node->first_child)
+ {
+ node = node->first_child;
+ continue;
+ }
+ }
+ else
{
- node = node->first_child;
- continue;
+ node_output_simple(writer, node, flags);
+
+ indent_flags = indent_newline | indent_indent;
}
}
- else
- {
- node_output_simple(writer, node, flags);
- }
// continue to the next node
while (node != root)
@@ -3705,14 +4274,22 @@ PUGI__NS_BEGIN
{
depth--;
- if (indent_length)
+ if ((indent_flags & indent_newline) && (flags & format_raw) == 0)
+ writer.write('\n');
+
+ if ((indent_flags & indent_indent) && indent_length)
text_output_indent(writer, indent, indent_length, depth);
- node_output_end(writer, node, flags);
+ node_output_end(writer, node);
+
+ indent_flags = indent_newline | indent_indent;
}
}
}
while (node != root);
+
+ if ((indent_flags & indent_newline) && (flags & format_raw) == 0)
+ writer.write('\n');
}
PUGI__FN bool has_declaration(xml_node_struct* node)
@@ -3775,7 +4352,8 @@ PUGI__NS_BEGIN
return true;
}
- PUGI__FN void node_copy_string(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char_t* source, uintptr_t& source_header, xml_allocator* alloc)
+ template
+ PUGI__FN void node_copy_string(String& dest, Header& header, uintptr_t header_mask, char_t* source, Header& source_header, xml_allocator* alloc)
{
assert(!dest && (header & header_mask) == 0);
@@ -3790,7 +4368,7 @@ PUGI__NS_BEGIN
source_header |= xml_memory_page_contents_shared_mask;
}
else
- strcpy_insitu(dest, header, header_mask, source);
+ strcpy_insitu(dest, header, header_mask, source, strlength(source));
}
}
@@ -3856,6 +4434,15 @@ PUGI__NS_BEGIN
}
}
+ PUGI__FN void node_copy_attribute(xml_attribute_struct* da, xml_attribute_struct* sa)
+ {
+ xml_allocator& alloc = get_allocator(da);
+ xml_allocator* shared_alloc = (&alloc == &get_allocator(sa)) ? &alloc : 0;
+
+ node_copy_string(da->name, da->header, xml_memory_page_name_allocated_mask, sa->name, sa->header, shared_alloc);
+ node_copy_string(da->value, da->header, xml_memory_page_value_allocated_mask, sa->value, sa->header, shared_alloc);
+ }
+
inline bool is_text_node(xml_node_struct* node)
{
xml_node_type type = PUGI__NODETYPE(node);
@@ -3864,49 +4451,93 @@ PUGI__NS_BEGIN
}
// get value with conversion functions
- PUGI__FN int get_integer_base(const char_t* value)
+ template U string_to_integer(const char_t* value, U minneg, U maxpos)
{
+ U result = 0;
const char_t* s = value;
while (PUGI__IS_CHARTYPE(*s, ct_space))
s++;
- if (*s == '-')
- s++;
+ bool negative = (*s == '-');
- return (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) ? 16 : 10;
- }
+ s += (*s == '+' || *s == '-');
- PUGI__FN int get_value_int(const char_t* value, int def)
- {
- if (!value) return def;
+ bool overflow = false;
- int base = get_integer_base(value);
+ if (s[0] == '0' && (s[1] | ' ') == 'x')
+ {
+ s += 2;
- #ifdef PUGIXML_WCHAR_MODE
- return static_cast(wcstol(value, 0, base));
- #else
- return static_cast(strtol(value, 0, base));
- #endif
- }
+ // since overflow detection relies on length of the sequence skip leading zeros
+ while (*s == '0')
+ s++;
- PUGI__FN unsigned int get_value_uint(const char_t* value, unsigned int def)
- {
- if (!value) return def;
+ const char_t* start = s;
- int base = get_integer_base(value);
+ for (;;)
+ {
+ if (static_cast