Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve handling UTF encoded inputs #296

Merged
merged 6 commits into from
Mar 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
229 changes: 124 additions & 105 deletions include/fkYAML/detail/encodings/encode_detector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include <istream>

#include <fkYAML/detail/macros/version_macros.hpp>
#include <fkYAML/detail/encodings/encode_t.hpp>
#include <fkYAML/detail/encodings/utf_encode_t.hpp>
#include <fkYAML/exception.hpp>

/// @brief namespace for fkYAML library.
Expand All @@ -27,63 +27,68 @@ namespace detail

/// @brief Detect an encoding type for UTF-8 expected inputs.
/// @note This function doesn't support the case where the first character is null.
/// @param b0 The 1st byte of an input character sequence.
/// @param b1 The 2nd byte of an input character sequence.
/// @param b2 The 3rd byte of an input character sequence.
/// @param b3 The 4th byte of an input character sequence.
/// @param[in] bytes 4 bytes of an input character sequence.
/// @param[out] has_bom Whether or not the input contains a BOM.
/// @return A detected encoding type.
inline encode_t detect_encoding_type(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3) noexcept
inline utf_encode_t detect_encoding_type(const std::array<uint8_t, 4>& bytes, bool& has_bom) noexcept
{
has_bom = false;

// Check if a BOM exists.

if (b0 == uint8_t(0xEFu) && b1 == uint8_t(0xBBu) && b2 == uint8_t(0xBFu))
if (bytes[0] == uint8_t(0xEFu) && bytes[1] == uint8_t(0xBBu) && bytes[2] == uint8_t(0xBFu))
{
return encode_t::UTF_8_BOM;
has_bom = true;
return utf_encode_t::UTF_8;
}

if (b0 == 0 && b1 == 0 && b2 == uint8_t(0xFEu) && b3 == uint8_t(0xFFu))
if (bytes[0] == 0 && bytes[1] == 0 && bytes[2] == uint8_t(0xFEu) && bytes[3] == uint8_t(0xFFu))
{
return encode_t::UTF_32BE_BOM;
has_bom = true;
return utf_encode_t::UTF_32BE;
}

if (b0 == uint8_t(0xFFu) && b1 == uint8_t(0xFEu) && b2 == 0 && b3 == 0)
if (bytes[0] == uint8_t(0xFFu) && bytes[1] == uint8_t(0xFEu) && bytes[2] == 0 && bytes[3] == 0)
{
return encode_t::UTF_32LE_BOM;
has_bom = true;
return utf_encode_t::UTF_32LE;
}

if (b0 == uint8_t(0xFEu) && b1 == uint8_t(0xFFu))
if (bytes[0] == uint8_t(0xFEu) && bytes[1] == uint8_t(0xFFu))
{
return encode_t::UTF_16BE_BOM;
has_bom = true;
return utf_encode_t::UTF_16BE;
}

if (b0 == uint8_t(0xFFu) && b1 == uint8_t(0xFEu))
if (bytes[0] == uint8_t(0xFFu) && bytes[1] == uint8_t(0xFEu))
{
return encode_t::UTF_16LE_BOM;
has_bom = true;
return utf_encode_t::UTF_16LE;
}

// Test the first character assuming it's an ASCII character.

if (b0 == 0 && b1 == 0 && b2 == 0 && 0 < b3 && b3 < uint8_t(0x80u))
if (bytes[0] == 0 && bytes[1] == 0 && bytes[2] == 0 && 0 < bytes[3] && bytes[3] < uint8_t(0x80u))
{
return encode_t::UTF_32BE_N;
return utf_encode_t::UTF_32BE;
}

if (0 < b0 && b0 < uint8_t(0x80u) && b1 == 0 && b2 == 0 && b3 == 0)
if (0 < bytes[0] && bytes[0] < uint8_t(0x80u) && bytes[1] == 0 && bytes[2] == 0 && bytes[3] == 0)
{
return encode_t::UTF_32LE_N;
return utf_encode_t::UTF_32LE;
}

if (b0 == 0 && 0 < b1 && b1 < uint8_t(0x80u))
if (bytes[0] == 0 && 0 < bytes[1] && bytes[1] < uint8_t(0x80u))
{
return encode_t::UTF_16BE_N;
return utf_encode_t::UTF_16BE;
}

if (0 < b0 && b0 < uint8_t(0x80u) && b1 == 0)
if (0 < bytes[0] && bytes[0] < uint8_t(0x80u) && bytes[1] == 0)
{
return encode_t::UTF_16LE_N;
return utf_encode_t::UTF_16LE;
}

return encode_t::UTF_8_N;
return utf_encode_t::UTF_8;
}

/// @brief Detects the encoding type of the input, and consumes a BOM if it exists.
Expand All @@ -93,9 +98,9 @@ inline encode_t detect_encoding_type(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t
/// @param end The end of input iterators.
/// @return A detected encoding type.
template <typename ItrType, size_t ElemSize = sizeof(decltype(*(std::declval<ItrType>())))>
inline encode_t detect_encoding_and_skip_bom(ItrType& begin, const ItrType& end)
inline utf_encode_t detect_encoding_and_skip_bom(ItrType& begin, const ItrType& end)
{
uint8_t bytes[4] = {0xFFu, 0xFFu, 0xFFu, 0xFFu};
std::array<uint8_t, 4> bytes = {0xFFu, 0xFFu, 0xFFu, 0xFFu};
switch (ElemSize)
{
case sizeof(char): { // this case covers char8_t as well when compiled with C++20 features.
Expand All @@ -104,86 +109,92 @@ inline encode_t detect_encoding_and_skip_bom(ItrType& begin, const ItrType& end)
bytes[i] = uint8_t(begin[i]);
}

encode_t encode_type = detect_encoding_type(bytes[0], bytes[1], bytes[2], bytes[3]);
switch (encode_type)
bool has_bom = false;
utf_encode_t encode_type = detect_encoding_type(bytes, has_bom);

if (has_bom)
{
case encode_t::UTF_8_BOM:
std::advance(begin, 3);
break;
case encode_t::UTF_16BE_BOM:
case encode_t::UTF_16LE_BOM:
std::advance(begin, 2);
break;
case encode_t::UTF_32BE_BOM:
case encode_t::UTF_32LE_BOM:
std::advance(begin, 4);
break;
default:
// Do nothing if a BOM doesn't exist.
break;
// skip reading the BOM.
switch (encode_type)
{
case utf_encode_t::UTF_8:
std::advance(begin, 3);
break;
case utf_encode_t::UTF_16BE:
case utf_encode_t::UTF_16LE:
std::advance(begin, 2);
break;
case utf_encode_t::UTF_32BE:
case utf_encode_t::UTF_32LE:
std::advance(begin, 4);
break;
}
}

return encode_type;
}
case sizeof(char16_t): {
if (begin == end)
{
return encode_t::UTF_16BE_N;
return utf_encode_t::UTF_16BE;
}
for (int i = 0; i < 2 && begin + i != end; i++)
{
bytes[i * 2] = uint8_t((begin[i] & 0xFF00u) >> 8);
bytes[i * 2 + 1] = uint8_t(begin[i] & 0xFFu);
}

encode_t encode_type = detect_encoding_type(bytes[0], bytes[1], bytes[2], bytes[3]);
switch (encode_type)
bool has_bom = false;
utf_encode_t encode_type = detect_encoding_type(bytes, has_bom);

if (encode_type != utf_encode_t::UTF_16BE && encode_type != utf_encode_t::UTF_16LE)
{
case encode_t::UTF_16BE_BOM:
case encode_t::UTF_16LE_BOM:
std::advance(begin, 1);
break;
case encode_t::UTF_16BE_N:
case encode_t::UTF_16LE_N:
// Do nothing if a BOM doesn't exist.
break;
default:
throw exception("char16_t characters must be encoded in the UTF-16 format.");
}

if (has_bom)
{
// skip reading the BOM.
std::advance(begin, 1);
}

return encode_type;
}
case sizeof(char32_t): {
if (begin == end)
{
return encode_t::UTF_32BE_N;
return utf_encode_t::UTF_32BE;
}

bytes[0] = uint8_t((*begin & 0xFF000000u) >> 24);
bytes[1] = uint8_t((*begin & 0x00FF0000u) >> 16);
bytes[2] = uint8_t((*begin & 0x0000FF00u) >> 8);
bytes[3] = uint8_t(*begin & 0x000000FFu);
encode_t encode_type = detect_encoding_type(bytes[0], bytes[1], bytes[2], bytes[3]);
switch (encode_type)

bool has_bom = false;
utf_encode_t encode_type = detect_encoding_type(bytes, has_bom);

if (encode_type != utf_encode_t::UTF_32BE && encode_type != utf_encode_t::UTF_32LE)
{
case encode_t::UTF_32BE_BOM:
case encode_t::UTF_32LE_BOM:
std::advance(begin, 1);
break;
case encode_t::UTF_32BE_N:
case encode_t::UTF_32LE_N:
// Do nothing if a BOM doesn't exist.
break;
default:
throw exception("char32_t characters must be encoded in the UTF-32 format.");
}

if (has_bom)
{
// skip reading the BOM.
std::advance(begin, 1);
}

return encode_type;
}
default:
throw exception("Unknown char size.");
}
}

inline encode_t detect_encoding_and_skip_bom(std::FILE* file) noexcept
inline utf_encode_t detect_encoding_and_skip_bom(std::FILE* file) noexcept
{
uint8_t bytes[4] = {0xFFu, 0xFFu, 0xFFu, 0xFFu};
std::array<uint8_t, 4> bytes = {0xFFu, 0xFFu, 0xFFu, 0xFFu};
for (std::size_t i = 0; i < 4; i++)
{
char byte = 0;
Expand All @@ -195,32 +206,36 @@ inline encode_t detect_encoding_and_skip_bom(std::FILE* file) noexcept
bytes[i] = uint8_t(byte & 0xFF);
}

encode_t encode_type = detect_encoding_type(bytes[0], bytes[1], bytes[2], bytes[3]);
switch (encode_type)
bool has_bom = false;
utf_encode_t encode_type = detect_encoding_type(bytes, has_bom);

// move back to the beginning if a BOM doesn't exist.
long offset = 0;
if (has_bom)
{
case encode_t::UTF_8_BOM:
fseek(file, 3, SEEK_SET);
break;
case encode_t::UTF_16BE_BOM:
case encode_t::UTF_16LE_BOM:
fseek(file, 2, SEEK_SET);
break;
case encode_t::UTF_32BE_BOM:
case encode_t::UTF_32LE_BOM:
fseek(file, 4, SEEK_SET);
break;
default:
// Move back to the beginning of the file contents if a BOM doesn't exist.
fseek(file, 0, SEEK_SET);
break;
switch (encode_type)
{
case utf_encode_t::UTF_8:
offset = 3;
break;
case utf_encode_t::UTF_16BE:
case utf_encode_t::UTF_16LE:
offset = 2;
break;
case utf_encode_t::UTF_32BE:
case utf_encode_t::UTF_32LE:
offset = 4;
break;
}
}
fseek(file, offset, SEEK_SET);

return encode_type;
}

inline encode_t detect_encoding_and_skip_bom(std::istream& is) noexcept
inline utf_encode_t detect_encoding_and_skip_bom(std::istream& is) noexcept
{
uint8_t bytes[4] = {0xFFu, 0xFFu, 0xFFu, 0xFFu};
std::array<uint8_t, 4> bytes = {0xFFu, 0xFFu, 0xFFu, 0xFFu};
for (std::size_t i = 0; i < 4; i++)
{
char ch = 0;
Expand All @@ -235,25 +250,29 @@ inline encode_t detect_encoding_and_skip_bom(std::istream& is) noexcept
bytes[i] = uint8_t(ch & 0xFF);
}

encode_t encode_type = detect_encoding_type(bytes[0], bytes[1], bytes[2], bytes[3]);
switch (encode_type)
bool has_bom = false;
utf_encode_t encode_type = detect_encoding_type(bytes, has_bom);

// move back to the beginning if a BOM doesn't exist.
std::streamoff offset = 0;
if (has_bom)
{
case encode_t::UTF_8_BOM:
is.seekg(3, std::ios_base::beg);
break;
case encode_t::UTF_16BE_BOM:
case encode_t::UTF_16LE_BOM:
is.seekg(2, std::ios_base::beg);
break;
case encode_t::UTF_32BE_BOM:
case encode_t::UTF_32LE_BOM:
is.seekg(4, std::ios_base::beg);
break;
default:
// Move back to the beginning of the file contents if a BOM doesn't exist.
is.seekg(0, std::ios_base::beg);
break;
switch (encode_type)
{
case utf_encode_t::UTF_8:
offset = 3;
break;
case utf_encode_t::UTF_16BE:
case utf_encode_t::UTF_16LE:
offset = 2;
break;
case utf_encode_t::UTF_32BE:
case utf_encode_t::UTF_32LE:
offset = 4;
break;
}
}
is.seekg(offset, std::ios_base::beg);

return encode_type;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
///
/// @file

#ifndef FK_YAML_DETAIL_ENCODINGS_ENCODE_T_HPP_
#define FK_YAML_DETAIL_ENCODINGS_ENCODE_T_HPP_
#ifndef FK_YAML_DETAIL_ENCODINGS_UTF_ENCODE_T_HPP_
#define FK_YAML_DETAIL_ENCODINGS_UTF_ENCODE_T_HPP_

#include <fkYAML/detail/macros/version_macros.hpp>

Expand All @@ -22,22 +22,17 @@ namespace detail

/// @brief Definition of Unicode encoding types
/// @note Since fkYAML doesn't process UTF-16/UTF-32 encoded characters byte by byte, endianness does not matter.
enum class encode_t
enum class utf_encode_t
{
UTF_8_N, //!< UTF-8 without BOM
UTF_8_BOM, //!< UTF-8 with BOM
UTF_16BE_N, //!< UTF-16BE without BOM
UTF_16BE_BOM, //!< UTF-16BE with BOM
UTF_16LE_N, //!< UTF-16LE without BOM
UTF_16LE_BOM, //!< UTF-16LE with BOM
UTF_32BE_N, //!< UTF-32BE without BOM
UTF_32BE_BOM, //!< UTF-32BE with BOM
UTF_32LE_N, //!< UTF-32LE without BOM
UTF_32LE_BOM, //!< UTF-32LE with BOM
UTF_8, //!< UTF-8
UTF_16BE, //!< UTF-16 Big Endian
UTF_16LE, //!< UTF-16 Little Endian
UTF_32BE, //!< UTF-32 Big Endian
UTF_32LE, //!< UTF-32 Little Endian
};

} // namespace detail

FK_YAML_NAMESPACE_END

#endif /* FK_YAML_DETAIL_ENCODINGS_ENCODE_T_HPP_ */
#endif /* FK_YAML_DETAIL_ENCODINGS_UTF_ENCODE_T_HPP_ */
Loading