Skip to content

Commit

Permalink
Merge pull request #221 from fktn-k/feature/209_support_utf16_utf32_i…
Browse files Browse the repository at this point in the history
…nput_for_parse

#209 support UTF-16 / UTF-32 for deserialization input characters
  • Loading branch information
fktn-k committed Nov 23, 2023
2 parents 0adc43c + 441c226 commit 00d0433
Show file tree
Hide file tree
Showing 9 changed files with 968 additions and 225 deletions.
13 changes: 10 additions & 3 deletions docs/mkdocs/docs/api/basic_node/deserialize.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@ static basic_node deserialize(PtrType&& ptr, std::size_t size); // (3)
Deserializes from compatible input sources.
Throws a [`fkyaml::exception`](../exception/index.md) if the deserialization process detects an error from the input sources.
!!! note "Supported Unicode Encodings"
fkYAML supports UTF-8, UTF-16 and UTF-32 encodings for input characters.
Note that input characters must be encoded in the UTF-8 format when deserializing from `FILE*` or `std::istream` objects.
An array/container of `char`, `char16_t` and `char32_t` denotes that its contents are encoded in the UTF-8, UTF-16 and UTF-32 format, respectively.
The deserialization process internally converts input characters into UTF-8 encoded ones if they are encoded in the UTF-16 or UTF-32 format.
## Overload (1)
```cpp
Expand All @@ -30,10 +37,10 @@ static basic_node deserialize(InputType&& input);

* an `std::istream` object
* a `FILE` pointer (must not be `nullptr`)
* a C-style array of characters
* a pointer to a null-terminated string of single byte characters.
* a C-style array of characters (`char`, `char16_t` or `char32_t`. See the "Supported Unicode Encodings" above.)
* char[N], char16_t[N], or char32_t[N] (N: the size of an array)
* a container `obj` for which `begin(obj)` and `end(obj)` produces a valid pair of iterators
* std::string, std::array<char>, and the likes.
* std::basic_string, std::array, and the likes.

### **Parameters**

Expand Down
1 change: 1 addition & 0 deletions docs/mkdocs/docs/api/basic_node/extraction_operator.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ inline std::istream& operator>>(std::istream& is, basic_node& n);
Extraction operator for basic_node template class.
Deserializes an input stream into a [`basic_node`](index.md).
This API is a wrapper of [`basic_node::deserialize()`](deserialize.md) function for input streams to simplify the implementation in the user's code.
Note that the contents of the input stream must be encoded in the UTF-8 format.

## **Parameters**

Expand Down
1 change: 1 addition & 0 deletions docs/mkdocs/docs/tutorials/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ Let's start with a really simple example.
Say you have an example.yaml file and now you want to load the contents.
Note that the following example files assume that you have installed the fkYAML library somewhere on your machine.
See [the CMake Integration section]() for the other ways and modify the implementation if necessary.
Also, make sure the example.yaml file is encoded in the UTF-8 format.

```title="Project Structure"
.
Expand Down
327 changes: 327 additions & 0 deletions include/fkYAML/detail/encodings/utf8_encoding.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,327 @@
/// _______ __ __ __ _____ __ __ __
/// | __| |_/ | \_/ |/ _ \ / \/ \| | fkYAML: A C++ header-only YAML library
/// | __| _ < \_ _/| ___ | _ | |___ version 0.2.1
/// |__| |_| \__| |_| |_| |_|___||___|______| https://github.com/fktn-k/fkYAML
///
/// SPDX-FileCopyrightText: 2023 Kensuke Fukutani <fktn.dev@gmail.com>
/// SPDX-License-Identifier: MIT
///
/// @file

#ifndef FK_YAML_DETAIL_ENCODINGS_UTF_ENCODING_HPP_
#define FK_YAML_DETAIL_ENCODINGS_UTF_ENCODING_HPP_

#include <array>
#include <cstdint>

#include <fkYAML/detail/macros/version_macros.hpp>
#include <fkYAML/exception.hpp>

/// @brief namespace for fkYAML library.
FK_YAML_NAMESPACE_BEGIN

/// @brief namespace for internal implementations of fkYAML library.
namespace detail
{

template <typename CharType>
class utf_encoding;

/////////////////////////
// UTF-8 Encoding ///
/////////////////////////

/// @brief A collection of static helpers which validate UTF-8 byte sequences and convert
///        UTF-16 / UTF-32 encoded characters into UTF-8 encoded bytes.
class utf8_encoding
{
    using int_type = std::char_traits<char>::int_type;

    /// @brief Checks if a value lies in the closed range [lower, upper].
    /// @param[in] value The value to be checked.
    /// @param[in] lower The inclusive lower bound.
    /// @param[in] upper The inclusive upper bound.
    /// @return true if lower <= value <= upper, false otherwise.
    static constexpr bool in_range(int_type value, int_type lower, int_type upper) noexcept
    {
        return lower <= value && value <= upper;
    }

    /// @brief Checks if a byte is a generic UTF-8 continuation byte (0x80..0xBF).
    /// @param[in] byte The byte to be checked.
    /// @return true if the byte is within 0x80..0xBF, false otherwise.
    static constexpr bool is_continuation(int_type byte) noexcept
    {
        return in_range(byte, 0x80, 0xBF);
    }

    /// @brief Encodes a single Unicode code point into UTF-8 bytes.
    /// @param[in] code_point The code point to be encoded.
    /// @param[out] utf8_bytes The destination of the encoded bytes. Only the first N elements are written,
    ///                        where N is the returned size.
    /// @return The number of bytes written (1..4), or 0 if the code point is a surrogate (U+D800..U+DFFF)
    ///         or is greater than U+10FFFF and thus cannot be encoded.
    static std::size_t encode_code_point(const char32_t code_point, std::array<char, 4>& utf8_bytes) noexcept
    {
        if (code_point < 0x80u)
        {
            // 0xxxxxxx
            utf8_bytes[0] = static_cast<char>(code_point);
            return 1;
        }

        if (code_point <= 0x7FFu)
        {
            // 110xxxxx 10xxxxxx
            utf8_bytes[0] = static_cast<char>(0xC0u | (code_point >> 6));
            utf8_bytes[1] = static_cast<char>(0x80u | (code_point & 0x3Fu));
            return 2;
        }

        if (code_point <= 0xFFFFu)
        {
            // Surrogates are not Unicode scalar values. Encoding them would produce the bytes
            // 0xED 0xA0..0xBF ..., which validate() rejects.
            if (0xD800u <= code_point && code_point <= 0xDFFFu)
            {
                return 0;
            }

            // 1110xxxx 10xxxxxx 10xxxxxx
            utf8_bytes[0] = static_cast<char>(0xE0u | (code_point >> 12));
            utf8_bytes[1] = static_cast<char>(0x80u | ((code_point >> 6) & 0x3Fu));
            utf8_bytes[2] = static_cast<char>(0x80u | (code_point & 0x3Fu));
            return 3;
        }

        if (code_point <= 0x10FFFFu)
        {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            utf8_bytes[0] = static_cast<char>(0xF0u | (code_point >> 18));
            utf8_bytes[1] = static_cast<char>(0x80u | ((code_point >> 12) & 0x3Fu));
            utf8_bytes[2] = static_cast<char>(0x80u | ((code_point >> 6) & 0x3Fu));
            utf8_bytes[3] = static_cast<char>(0x80u | (code_point & 0x3Fu));
            return 4;
        }

        // Beyond the Unicode code space.
        return 0;
    }

public:
    /// @brief Validates the encoding of a given byte array whose length is 1.
    /// @param[in] byte_array The byte array to be validated.
    /// @return true if a given byte array is valid, false otherwise.
    static bool validate(std::array<int_type, 1> byte_array) noexcept
    {
        // U+0000..U+007F
        return in_range(byte_array[0], 0x00, 0x7F);
    }

    /// @brief Validates the encoding of a given byte array whose length is 2.
    /// @param[in] byte_array The byte array to be validated.
    /// @return true if a given byte array is valid, false otherwise.
    static bool validate(std::array<int_type, 2> byte_array) noexcept
    {
        // U+0080..U+07FF
        //   1st Byte: 0xC2..0xDF (0xC0 and 0xC1 would only form overlong encodings)
        //   2nd Byte: 0x80..0xBF
        return in_range(byte_array[0], 0xC2, 0xDF) && is_continuation(byte_array[1]);
    }

    /// @brief Validates the encoding of a given byte array whose length is 3.
    /// @param[in] byte_array The byte array to be validated.
    /// @return true if a given byte array is valid, false otherwise.
    static bool validate(std::array<int_type, 3> byte_array) noexcept
    {
        const int_type first = byte_array[0];
        const int_type second = byte_array[1];
        bool is_second_valid = false;

        if (first == 0xE0)
        {
            // U+0800..U+0FFF:
            //   2nd Byte: 0xA0..0xBF (0x80..0x9F would be an overlong encoding of U+0000..U+07FF)
            is_second_valid = in_range(second, 0xA0, 0xBF);
        }
        else if (in_range(first, 0xE1, 0xEC) || first == 0xEE || first == 0xEF)
        {
            // U+1000..U+CFFF (1st Byte: 0xE1..0xEC) and U+E000..U+FFFF (1st Byte: 0xEE..0xEF):
            //   2nd Byte: 0x80..0xBF
            is_second_valid = is_continuation(second);
        }
        else if (first == 0xED)
        {
            // U+D000..U+D7FF:
            //   2nd Byte: 0x80..0x9F (0xA0..0xBF would encode the surrogates U+D800..U+DFFF)
            is_second_valid = in_range(second, 0x80, 0x9F);
        }
        // Any other 1st byte cannot start a 3-byte sequence.

        // 3rd Byte: 0x80..0xBF in every valid case.
        return is_second_valid && is_continuation(byte_array[2]);
    }

    /// @brief Validates the encoding of a given byte array whose length is 4.
    /// @param[in] byte_array The byte array to be validated.
    /// @return true if a given byte array is valid, false otherwise.
    static bool validate(std::array<int_type, 4> byte_array) noexcept
    {
        const int_type first = byte_array[0];
        const int_type second = byte_array[1];
        bool is_second_valid = false;

        if (first == 0xF0)
        {
            // U+10000..U+3FFFF:
            //   2nd Byte: 0x90..0xBF (0x80..0x8F would be an overlong encoding)
            is_second_valid = in_range(second, 0x90, 0xBF);
        }
        else if (in_range(first, 0xF1, 0xF3))
        {
            // U+40000..U+FFFFF:
            //   2nd Byte: 0x80..0xBF
            is_second_valid = is_continuation(second);
        }
        else if (first == 0xF4)
        {
            // U+100000..U+10FFFF:
            //   2nd Byte: 0x80..0x8F (0x90..0xBF would exceed U+10FFFF)
            is_second_valid = in_range(second, 0x80, 0x8F);
        }
        // Any other 1st byte cannot start a 4-byte sequence.

        // 3rd & 4th Bytes: 0x80..0xBF in every valid case.
        return is_second_valid && is_continuation(byte_array[2]) && is_continuation(byte_array[3]);
    }

    /// @brief Converts UTF-16 encoded characters to UTF-8 encoded bytes.
    /// @param[in] utf16 UTF-16 encoded character(s). The 2nd element is examined only if the 1st one is a high
    /// surrogate.
    /// @param[out] utf8_bytes UTF-8 encoded bytes. Unused trailing elements are set to 0.
    /// @param[out] consumed_size The number of UTF-16 encoded characters used for the conversion (1 or 2).
    /// @param[out] encoded_size The number of UTF-8 encoded bytes written to utf8_bytes (1..4).
    /// @throws fkyaml::exception if the input is an unpaired surrogate. The output parameters are zeroed in that case.
    static void from_utf16(
        std::array<char16_t, 2> utf16, std::array<char, 4>& utf8_bytes, std::size_t& consumed_size,
        std::size_t& encoded_size)
    {
        utf8_bytes.fill(0);
        consumed_size = 0;
        encoded_size = 0;

        const char32_t lead = utf16[0];
        const char32_t trail = utf16[1];

        char32_t code_point = 0;
        std::size_t num_units = 0;

        if (lead < 0xD800u || 0xE000u <= lead)
        {
            // A code unit outside of the surrogate range is a code point on its own.
            code_point = lead;
            num_units = 1;
        }
        else if (lead <= 0xDBFFu && 0xDC00u <= trail && trail <= 0xDFFFu)
        {
            // A high surrogate followed by a low surrogate encodes U+10000..U+10FFFF.
            code_point = 0x10000u + ((lead & 0x03FFu) << 10) + (trail & 0x03FFu);
            num_units = 2;
        }
        else
        {
            // A lone low surrogate, or a high surrogate which is not followed by a low surrogate.
            throw fkyaml::exception("Invalid UTF-16 encoding detected.");
        }

        // The code point is a valid Unicode scalar value here, so encoding it cannot fail.
        encoded_size = encode_code_point(code_point, utf8_bytes);
        consumed_size = num_units;
    }

    /// @brief Converts a UTF-32 encoded character to UTF-8 encoded bytes.
    /// @param[in] utf32 A UTF-32 encoded character.
    /// @param[out] utf8_bytes UTF-8 encoded bytes. Unused trailing elements are set to 0.
    /// @param[out] encoded_size The number of UTF-8 encoded bytes written to utf8_bytes (1..4).
    /// @throws fkyaml::exception if the input is a surrogate (U+D800..U+DFFF) or is greater than U+10FFFF.
    ///         The output parameters are zeroed in that case.
    static void from_utf32(const char32_t utf32, std::array<char, 4>& utf8_bytes, std::size_t& encoded_size)
    {
        utf8_bytes.fill(0);
        encoded_size = encode_code_point(utf32, utf8_bytes);

        if (encoded_size == 0)
        {
            throw fkyaml::exception("Invalid UTF-32 encoding detected.");
        }
    }
};

} // namespace detail

FK_YAML_NAMESPACE_END

#endif /* FK_YAML_DETAIL_ENCODINGS_UTF_ENCODING_HPP_ */

0 comments on commit 00d0433

Please sign in to comment.