fktn-k · fktn-k · Mar 24, 2024 · Mar 20, 2024 · Mar 20, 2024 · Mar 22, 2024
@@ -15,7 +15,7 @@
 #include <istream>
 
 #include <fkYAML/detail/macros/version_macros.hpp>
-#include <fkYAML/detail/encodings/encode_t.hpp>
+#include <fkYAML/detail/encodings/utf_encode_t.hpp>
 #include <fkYAML/exception.hpp>
 
 /// @brief namespace for fkYAML library.
@@ -27,63 +27,68 @@ namespace detail
 
 /// @brief Detect an encoding type for UTF-8 expected inputs.
 /// @note This function doesn't support the case where the first character is null.
-/// @param b0 The 1st byte of an input character sequence.
-/// @param b1 The 2nd byte of an input character sequence.
-/// @param b2 The 3rd byte of an input character sequence.
-/// @param b3 The 4th byte of an input character sequence.
+/// @param[in] bytes The first 4 bytes of an input character sequence.
+/// @param[out] has_bom Set to true if the given bytes start with a BOM, false otherwise.
 /// @return A detected encoding type.
-inline encode_t detect_encoding_type(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3) noexcept
+inline utf_encode_t detect_encoding_type(const std::array<uint8_t, 4>& bytes, bool& has_bom) noexcept
 {
+    has_bom = false;
+
     // Check if a BOM exists.
 
-    if (b0 == uint8_t(0xEFu) && b1 == uint8_t(0xBBu) && b2 == uint8_t(0xBFu))
+    if (bytes[0] == uint8_t(0xEFu) && bytes[1] == uint8_t(0xBBu) && bytes[2] == uint8_t(0xBFu))
     {
-        return encode_t::UTF_8_BOM;
+        has_bom = true;
+        return utf_encode_t::UTF_8;
     }
 
-    if (b0 == 0 && b1 == 0 && b2 == uint8_t(0xFEu) && b3 == uint8_t(0xFFu))
+    if (bytes[0] == 0 && bytes[1] == 0 && bytes[2] == uint8_t(0xFEu) && bytes[3] == uint8_t(0xFFu))
     {
-        return encode_t::UTF_32BE_BOM;
+        has_bom = true;
+        return utf_encode_t::UTF_32BE;
     }
 
-    if (b0 == uint8_t(0xFFu) && b1 == uint8_t(0xFEu) && b2 == 0 && b3 == 0)
+    if (bytes[0] == uint8_t(0xFFu) && bytes[1] == uint8_t(0xFEu) && bytes[2] == 0 && bytes[3] == 0)
     {
-        return encode_t::UTF_32LE_BOM;
+        has_bom = true;
+        return utf_encode_t::UTF_32LE;
     }
 
-    if (b0 == uint8_t(0xFEu) && b1 == uint8_t(0xFFu))
+    if (bytes[0] == uint8_t(0xFEu) && bytes[1] == uint8_t(0xFFu))
     {
-        return encode_t::UTF_16BE_BOM;
+        has_bom = true;
+        return utf_encode_t::UTF_16BE;
     }
 
-    if (b0 == uint8_t(0xFFu) && b1 == uint8_t(0xFEu))
+    if (bytes[0] == uint8_t(0xFFu) && bytes[1] == uint8_t(0xFEu))
     {
-        return encode_t::UTF_16LE_BOM;
+        has_bom = true;
+        return utf_encode_t::UTF_16LE;
     }
 
     // Test the first character assuming it's an ASCII character.
 
-    if (b0 == 0 && b1 == 0 && b2 == 0 && 0 < b3 && b3 < uint8_t(0x80u))
+    if (bytes[0] == 0 && bytes[1] == 0 && bytes[2] == 0 && 0 < bytes[3] && bytes[3] < uint8_t(0x80u))
     {
-        return encode_t::UTF_32BE_N;
+        return utf_encode_t::UTF_32BE;
     }
 
-    if (0 < b0 && b0 < uint8_t(0x80u) && b1 == 0 && b2 == 0 && b3 == 0)
+    if (0 < bytes[0] && bytes[0] < uint8_t(0x80u) && bytes[1] == 0 && bytes[2] == 0 && bytes[3] == 0)
     {
-        return encode_t::UTF_32LE_N;
+        return utf_encode_t::UTF_32LE;
     }
 
-    if (b0 == 0 && 0 < b1 && b1 < uint8_t(0x80u))
+    if (bytes[0] == 0 && 0 < bytes[1] && bytes[1] < uint8_t(0x80u))
     {
-        return encode_t::UTF_16BE_N;
+        return utf_encode_t::UTF_16BE;
     }
 
-    if (0 < b0 && b0 < uint8_t(0x80u) && b1 == 0)
+    if (0 < bytes[0] && bytes[0] < uint8_t(0x80u) && bytes[1] == 0)
     {
-        return encode_t::UTF_16LE_N;
+        return utf_encode_t::UTF_16LE;
     }
 
-    return encode_t::UTF_8_N;
+    return utf_encode_t::UTF_8;
 }
 
 /// @brief Detects the encoding type of the input, and consumes a BOM if it exists.
@@ -93,9 +98,9 @@ inline encode_t detect_encoding_type(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t
 /// @param end The end of input iterators.
 /// @return A detected encoding type.
 template <typename ItrType, size_t ElemSize = sizeof(decltype(*(std::declval<ItrType>())))>
-inline encode_t detect_encoding_and_skip_bom(ItrType& begin, const ItrType& end)
+inline utf_encode_t detect_encoding_and_skip_bom(ItrType& begin, const ItrType& end)
 {
-    uint8_t bytes[4] = {0xFFu, 0xFFu, 0xFFu, 0xFFu};
+    std::array<uint8_t, 4> bytes = {0xFFu, 0xFFu, 0xFFu, 0xFFu};
     switch (ElemSize)
     {
     case sizeof(char): { // this case covers char8_t as well when compiled with C++20 features.
@@ -104,86 +109,92 @@ inline encode_t detect_encoding_and_skip_bom(ItrType& begin, const ItrType& end)
             bytes[i] = uint8_t(begin[i]);
         }
 
-        encode_t encode_type = detect_encoding_type(bytes[0], bytes[1], bytes[2], bytes[3]);
-        switch (encode_type)
+        bool has_bom = false;
+        utf_encode_t encode_type = detect_encoding_type(bytes, has_bom);
+
+        if (has_bom)
         {
-        case encode_t::UTF_8_BOM:
-            std::advance(begin, 3);
-            break;
-        case encode_t::UTF_16BE_BOM:
-        case encode_t::UTF_16LE_BOM:
-            std::advance(begin, 2);
-            break;
-        case encode_t::UTF_32BE_BOM:
-        case encode_t::UTF_32LE_BOM:
-            std::advance(begin, 4);
-            break;
-        default:
-            // Do nothing if a BOM doesn't exist.
-            break;
+            // skip reading the BOM.
+            switch (encode_type)
+            {
+            case utf_encode_t::UTF_8:
+                std::advance(begin, 3);
+                break;
+            case utf_encode_t::UTF_16BE:
+            case utf_encode_t::UTF_16LE:
+                std::advance(begin, 2);
+                break;
+            case utf_encode_t::UTF_32BE:
+            case utf_encode_t::UTF_32LE:
+                std::advance(begin, 4);
+                break;
+            }
         }
+
         return encode_type;
     }
     case sizeof(char16_t): {
         if (begin == end)
         {
-            return encode_t::UTF_16BE_N;
+            return utf_encode_t::UTF_16BE;
         }
         for (int i = 0; i < 2 && begin + i != end; i++)
         {
             bytes[i * 2] = uint8_t((begin[i] & 0xFF00u) >> 8);
             bytes[i * 2 + 1] = uint8_t(begin[i] & 0xFFu);
         }
 
-        encode_t encode_type = detect_encoding_type(bytes[0], bytes[1], bytes[2], bytes[3]);
-        switch (encode_type)
+        bool has_bom = false;
+        utf_encode_t encode_type = detect_encoding_type(bytes, has_bom);
+
+        if (encode_type != utf_encode_t::UTF_16BE && encode_type != utf_encode_t::UTF_16LE)
         {
-        case encode_t::UTF_16BE_BOM:
-        case encode_t::UTF_16LE_BOM:
-            std::advance(begin, 1);
-            break;
-        case encode_t::UTF_16BE_N:
-        case encode_t::UTF_16LE_N:
-            // Do nothing if a BOM doesn't exist.
-            break;
-        default:
             throw exception("char16_t characters must be encoded in the UTF-16 format.");
         }
+
+        if (has_bom)
+        {
+            // skip reading the BOM.
+            std::advance(begin, 1);
+        }
+
         return encode_type;
     }
     case sizeof(char32_t): {
         if (begin == end)
         {
-            return encode_t::UTF_32BE_N;
+            return utf_encode_t::UTF_32BE;
         }
+
         bytes[0] = uint8_t((*begin & 0xFF000000u) >> 24);
         bytes[1] = uint8_t((*begin & 0x00FF0000u) >> 16);
         bytes[2] = uint8_t((*begin & 0x0000FF00u) >> 8);
         bytes[3] = uint8_t(*begin & 0x000000FFu);
-        encode_t encode_type = detect_encoding_type(bytes[0], bytes[1], bytes[2], bytes[3]);
-        switch (encode_type)
+
+        bool has_bom = false;
+        utf_encode_t encode_type = detect_encoding_type(bytes, has_bom);
+
+        if (encode_type != utf_encode_t::UTF_32BE && encode_type != utf_encode_t::UTF_32LE)
         {
-        case encode_t::UTF_32BE_BOM:
-        case encode_t::UTF_32LE_BOM:
-            std::advance(begin, 1);
-            break;
-        case encode_t::UTF_32BE_N:
-        case encode_t::UTF_32LE_N:
-            // Do nothing if a BOM doesn't exist.
-            break;
-        default:
             throw exception("char32_t characters must be encoded in the UTF-32 format.");
         }
+
+        if (has_bom)
+        {
+            // skip reading the BOM.
+            std::advance(begin, 1);
+        }
+
         return encode_type;
     }
     default:
         throw exception("Unknown char size.");
     }
 }
 
-inline encode_t detect_encoding_and_skip_bom(std::FILE* file) noexcept
+inline utf_encode_t detect_encoding_and_skip_bom(std::FILE* file) noexcept
 {
-    uint8_t bytes[4] = {0xFFu, 0xFFu, 0xFFu, 0xFFu};
+    std::array<uint8_t, 4> bytes = {0xFFu, 0xFFu, 0xFFu, 0xFFu};
     for (std::size_t i = 0; i < 4; i++)
     {
         char byte = 0;
@@ -195,32 +206,36 @@ inline encode_t detect_encoding_and_skip_bom(std::FILE* file) noexcept
         bytes[i] = uint8_t(byte & 0xFF);
     }
 
-    encode_t encode_type = detect_encoding_type(bytes[0], bytes[1], bytes[2], bytes[3]);
-    switch (encode_type)
+    bool has_bom = false;
+    utf_encode_t encode_type = detect_encoding_type(bytes, has_bom);
+
+    // move back to the beginning if a BOM doesn't exist.
+    long offset = 0;
+    if (has_bom)
     {
-    case encode_t::UTF_8_BOM:
-        fseek(file, 3, SEEK_SET);
-        break;
-    case encode_t::UTF_16BE_BOM:
-    case encode_t::UTF_16LE_BOM:
-        fseek(file, 2, SEEK_SET);
-        break;
-    case encode_t::UTF_32BE_BOM:
-    case encode_t::UTF_32LE_BOM:
-        fseek(file, 4, SEEK_SET);
-        break;
-    default:
-        // Move back to the beginning of the file contents if a BOM doesn't exist.
-        fseek(file, 0, SEEK_SET);
-        break;
+        switch (encode_type)
+        {
+        case utf_encode_t::UTF_8:
+            offset = 3;
+            break;
+        case utf_encode_t::UTF_16BE:
+        case utf_encode_t::UTF_16LE:
+            offset = 2;
+            break;
+        case utf_encode_t::UTF_32BE:
+        case utf_encode_t::UTF_32LE:
+            offset = 4;
+            break;
+        }
     }
+    fseek(file, offset, SEEK_SET);
 
     return encode_type;
 }
 
-inline encode_t detect_encoding_and_skip_bom(std::istream& is) noexcept
+inline utf_encode_t detect_encoding_and_skip_bom(std::istream& is) noexcept
 {
-    uint8_t bytes[4] = {0xFFu, 0xFFu, 0xFFu, 0xFFu};
+    std::array<uint8_t, 4> bytes = {0xFFu, 0xFFu, 0xFFu, 0xFFu};
     for (std::size_t i = 0; i < 4; i++)
     {
         char ch = 0;
@@ -235,25 +250,29 @@ inline encode_t detect_encoding_and_skip_bom(std::istream& is) noexcept
         bytes[i] = uint8_t(ch & 0xFF);
     }
 
-    encode_t encode_type = detect_encoding_type(bytes[0], bytes[1], bytes[2], bytes[3]);
-    switch (encode_type)
+    bool has_bom = false;
+    utf_encode_t encode_type = detect_encoding_type(bytes, has_bom);
+
+    // move back to the beginning if a BOM doesn't exist.
+    std::streamoff offset = 0;
+    if (has_bom)
     {
-    case encode_t::UTF_8_BOM:
-        is.seekg(3, std::ios_base::beg);
-        break;
-    case encode_t::UTF_16BE_BOM:
-    case encode_t::UTF_16LE_BOM:
-        is.seekg(2, std::ios_base::beg);
-        break;
-    case encode_t::UTF_32BE_BOM:
-    case encode_t::UTF_32LE_BOM:
-        is.seekg(4, std::ios_base::beg);
-        break;
-    default:
-        // Move back to the beginning of the file contents if a BOM doesn't exist.
-        is.seekg(0, std::ios_base::beg);
-        break;
+        switch (encode_type)
+        {
+        case utf_encode_t::UTF_8:
+            offset = 3;
+            break;
+        case utf_encode_t::UTF_16BE:
+        case utf_encode_t::UTF_16LE:
+            offset = 2;
+            break;
+        case utf_encode_t::UTF_32BE:
+        case utf_encode_t::UTF_32LE:
+            offset = 4;
+            break;
+        }
     }
+    is.seekg(offset, std::ios_base::beg);
 
     return encode_type;
 }

@@ -8,8 +8,8 @@
 ///
 /// @file
 
-#ifndef FK_YAML_DETAIL_ENCODINGS_ENCODE_T_HPP_
-#define FK_YAML_DETAIL_ENCODINGS_ENCODE_T_HPP_
+#ifndef FK_YAML_DETAIL_ENCODINGS_UTF_ENCODE_T_HPP_
+#define FK_YAML_DETAIL_ENCODINGS_UTF_ENCODE_T_HPP_
 
 #include <fkYAML/detail/macros/version_macros.hpp>
 
@@ -22,22 +22,17 @@ namespace detail
 
 /// @brief Definition of Unicode encoding types
 /// @note Since fkYAML doesn't treat UTF-16/UTF-32 encoded characters per byte, endians do not matter.
-enum class encode_t
+enum class utf_encode_t
 {
-    UTF_8_N,      //!< UTF-8 without BOM
-    UTF_8_BOM,    //!< UTF-8 with BOM
-    UTF_16BE_N,   //!< UTF-16BE without BOM
-    UTF_16BE_BOM, //!< UTF-16BE with BOM
-    UTF_16LE_N,   //!< UTF-16LE without BOM
-    UTF_16LE_BOM, //!< UTF-16LE with BOM
-    UTF_32BE_N,   //!< UTF-32BE without BOM
-    UTF_32BE_BOM, //!< UTF-32BE with BOM
-    UTF_32LE_N,   //!< UTF-32LE without BOM
-    UTF_32LE_BOM, //!< UTF-32LE with BOM
+    UTF_8,    //!< UTF-8
+    UTF_16BE, //!< UTF-16 Big Endian
+    UTF_16LE, //!< UTF-16 Little Endian
+    UTF_32BE, //!< UTF-32 Big Endian
+    UTF_32LE, //!< UTF-32 Little Endian
 };
 
 } // namespace detail
 
 FK_YAML_NAMESPACE_END
 
-#endif /* FK_YAML_DETAIL_ENCODINGS_ENCODE_T_HPP_ */
+#endif /* FK_YAML_DETAIL_ENCODINGS_UTF_ENCODE_T_HPP_ */