Skip to content

Commit

Permalink
Add lexy::dsl::byte.if_/set/range/ascii methods to match specific bytes
Browse files Browse the repository at this point in the history

Fixes #168.
  • Loading branch information
foonathan committed Sep 26, 2023
1 parent 0c2f487 commit bb36367
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 27 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

* Change `lexy::dsl::try_()` error recovery behavior:
It will now skip whitespace after the (optional) error recovery rule.
* Add `lexy::dsl::byte.if_`/`set`/`range`/`ascii` methods to match specific bytes.
* Add an overload of `fatal_error()` on scanners that allow construction of type-erased generic errors (#134).
* Add `lexy::buffer::release()` and `lexy::buffer::adopt()`.
* Add default argument to `lexy::dsl::flag()`.
Expand Down
39 changes: 32 additions & 7 deletions docs/content/reference/dsl/byte.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -27,26 +27,51 @@ Rules that match one or more bytes.
----
namespace lexy::dsl
{
class _bytes-dsl_ // models _token-rule_
{
public:
template <typename Predicate>
constexpr _token-rule_ auto if_() const;
template <unsigned char ... Bytes>
constexpr _token-rule_ auto set() const;
template <unsigned char Low, unsigned char High>
constexpr _token-rule_ auto range() const;
constexpr _token-rule_ auto ascii() const;
};
template <std::size_t N>
constexpr _token-rule_ auto bytes;
constexpr _bytes-dsl_ auto bytes;
constexpr _token-rule_ auto byte = bytes<1>;
constexpr _bytes-dsl_ auto byte = bytes<1>;
}
----

[.lead]
`bytes` is a {{% token-rule %}} that matches `N` arbitrary bytes.
`bytes` is a {{% token-rule %}} that matches `N` bytes from a specified set.

Requires::
The input {{% encoding %}} is `lexy::byte_encoding`.
Matching::
Matches and consumes `N` bytes.
Matches and consumes `N` bytes:
* By default, it matches arbitrary bytes in the range `[0x00, 0xFF]` (both sides inclusive).
Its name is `byte`.
* `.if_()`: matches bytes where the predicate returns true.
It must have a `constexpr bool operator()(unsigned char)`.
Its name is the type name of `Predicate`.
* `.set()`: matches the specified bytes.
Its name is `byte.set`.
* `.range()`: matches the bytes in the range `[Low, High]` (both sides inclusive).
Its name is `byte.range`.
* `.ascii()`: matches bytes that are also ASCII characters; i.e. in the range `[0x00, 0x7F]` (both sides inclusive).
Its name is `byte.ASCII`.
Errors::
{{% docref "lexy::expected_char_class" %}} (`"byte"`):
if EOF was encountered early; at the position of the last byte it could consume.
{{% docref "lexy::expected_char_class" %}} (with the name as above):
if a mismatched byte or EOF was encountered early; at the position of the last byte it could consume.
The rule then fails.
Parse tree::
Single token node with the {{% docref "lexy::predefined_token_kind" %}} `lexy::any_token_kind`.
Single token node with the {{% docref "lexy::predefined_token_kind" %}} `lexy::any_token_kind` (default) or `lexy::unknown_token_kind` (with predicate/range).

CAUTION: Combining {{% docref "lexy::dsl::capture" %}} with `bytes` does not do any endianness conversion.

Expand Down
116 changes: 100 additions & 16 deletions include/lexy/dsl/byte.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,36 @@
#include <cstdint>
#include <lexy/_detail/integer_sequence.hpp>
#include <lexy/dsl/base.hpp>
#include <lexy/dsl/char_class.hpp>
#include <lexy/dsl/token.hpp>

//=== byte ===//
namespace lexyd
{
template <std::size_t N>
struct _b : token_base<_b<N>>
template <std::size_t N, typename Predicate>
struct _b : token_base<_b<N, Predicate>>
{
static_assert(N > 0);

// Decides whether a single peeked byte value is acceptable.
// EOF never matches. Without a Predicate every byte matches; with one,
// the byte is forwarded to the (stateless, constexpr) predicate.
static constexpr bool _match(lexy::byte_encoding::int_type cur)
{
if (cur == lexy::byte_encoding::eof())
return false;

if constexpr (std::is_void_v<Predicate>)
{
// No predicate: any byte in [0x00, 0xFF] is fine.
return true;
}
else
{
// Delegate the decision to the predicate's call operator.
return Predicate{}(static_cast<lexy::byte_encoding::char_type>(cur));
}
}

template <typename Reader, typename Indices = lexy::_detail::make_index_sequence<N>>
struct tp;

template <typename Reader, std::size_t... Idx>
struct tp<Reader, lexy::_detail::index_sequence<Idx...>>
{
Expand All @@ -31,34 +49,99 @@ struct _b : token_base<_b<N>>
static_assert(std::is_same_v<typename Reader::encoding, lexy::byte_encoding>);

// Bump N times.
auto result = ((reader.peek() == Reader::encoding::eof() ? ((void)Idx, false)
: (reader.bump(), true))
&& ...);
end = reader.position();
auto result
= ((_match(reader.peek()) ? (reader.bump(), true) : ((void)Idx, false)) && ...);
end = reader.position();
return result;
}

template <typename Context>
constexpr void report_error(Context& context, const Reader&)
{
auto err = lexy::error<Reader, lexy::expected_char_class>(end, "byte");
constexpr auto name
= std::is_void_v<Predicate> ? "byte" : lexy::_detail::type_name<Predicate>();
auto err = lexy::error<Reader, lexy::expected_char_class>(end, name);
context.on(_ev::error{}, err);
}
};

//=== dsl ===//
/// Attaches a byte predicate, producing a token rule that only matches
/// bytes for which `Pred{}(byte)` is true.
template <typename Pred>
constexpr auto if_() const
{
// A predicate may only be supplied once; the base rule must still be
// the unconstrained one.
static_assert(std::is_void_v<Predicate>);
return _b<N, Pred>{};
}

// Matches bytes in the inclusive range [Low, High].
// NOTE(review): the local class is the Predicate type passed to if_(), so
// its identity also determines the reported error name — presumably via
// `type_name<Predicate>` / the static name(); confirm before renaming it.
template <unsigned char Low, unsigned char High>
constexpr auto range() const
{
struct predicate
{
// Error/char-class name shown in diagnostics.
static LEXY_CONSTEVAL auto name()
{
return "byte.range";
}

// Both bounds are inclusive.
constexpr bool operator()(unsigned char byte) const
{
return Low <= byte && byte <= High;
}
};

return if_<predicate>();
}

// Matches any byte that is one of the listed Bytes.
// NOTE(review): the local class is the Predicate type passed to if_(), so
// its identity also determines the reported error name; confirm before
// renaming it.
template <unsigned char... Bytes>
constexpr auto set() const
{
struct predicate
{
// Error/char-class name shown in diagnostics.
static LEXY_CONSTEVAL auto name()
{
return "byte.set";
}

// Fold over the pack: true if the byte equals any listed value.
constexpr bool operator()(unsigned char byte) const
{
return ((byte == Bytes) || ...);
}
};

return if_<predicate>();
}

// Matches bytes that are also ASCII characters, i.e. in [0x00, 0x7F].
// NOTE(review): the local class is the Predicate type passed to if_(), so
// its identity also determines the reported error name; confirm before
// renaming it.
constexpr auto ascii() const
{
struct predicate
{
// Error/char-class name shown in diagnostics.
static LEXY_CONSTEVAL auto name()
{
return "byte.ASCII";
}

// unsigned char is never negative, so a single upper-bound check suffices.
constexpr bool operator()(unsigned char byte) const
{
return byte <= 0x7F;
}
};

return if_<predicate>();
}
};

/// Matches an arbitrary byte.
constexpr auto byte = _b<1>{};
constexpr auto byte = _b<1, void>{};

/// Matches N arbitrary bytes.
template <std::size_t N>
constexpr auto bytes = _b<N>{};
constexpr auto bytes = _b<N, void>{};
} // namespace lexyd

namespace lexy
{
template <std::size_t N>
constexpr auto token_kind_of<lexy::dsl::_b<N>> = lexy::any_token_kind;
constexpr auto token_kind_of<lexy::dsl::_b<N, void>> = lexy::any_token_kind;
} // namespace lexy

//=== padding bytes ===//
Expand Down Expand Up @@ -88,9 +171,9 @@ struct _pb : branch_base

constexpr auto try_parse(const void*, const Reader& reader)
{
lexy::token_parser_for<_b<N>, Reader> parser(reader);
auto result = parser.try_parse(reader);
end = parser.end;
lexy::token_parser_for<_b<N, void>, Reader> parser(reader);
auto result = parser.try_parse(reader);
end = parser.end;
return result;
}

Expand Down Expand Up @@ -118,7 +201,7 @@ struct _pb : branch_base
LEXY_PARSER_FUNC static bool parse(Context& context, Reader& reader, Args&&... args)
{
auto begin = reader.position();
if (!_b<N>::token_parse(context, reader))
if (!_b<N, void>::token_parse(context, reader))
return false;
auto end = reader.position();

Expand Down Expand Up @@ -167,6 +250,7 @@ auto _bint()
return 0;
}
}

template <std::size_t N>
using bint = decltype(_bint<N>());
} // namespace lexy::_detail
Expand All @@ -187,10 +271,11 @@ namespace lexyd
template <std::size_t N, int Endianness, typename Rule = void>
struct _bint : branch_base
{
using _rule = lexy::_detail::type_or<Rule, _b<N>>;
using _rule = lexy::_detail::type_or<Rule, _b<N, void>>;

template <typename NextParser, typename Indices = lexy::_detail::make_index_sequence<N>>
struct _pc;

template <typename NextParser, std::size_t... Idx>
struct _pc<NextParser, lexy::_detail::index_sequence<Idx...>>
{
Expand Down Expand Up @@ -308,4 +393,3 @@ inline constexpr auto big_bint64 = _bint<8, lexy::_detail::bint_big>{};
} // namespace lexyd

#endif // LEXY_DSL_BYTE_HPP_INCLUDED

80 changes: 76 additions & 4 deletions tests/lexy/dsl/byte.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,82 @@

TEST_CASE("dsl::byte")
{
constexpr auto rule = dsl::byte;
CHECK(lexy::is_token_rule<decltype(rule)>);
CHECK(equivalent_rules(rule, dsl::bytes<1>));
static constexpr auto callback = token_callback;

// Feeds a single byte of input to `rule` and validates the resulting trace.
// Returns true if the rule matched the byte, false if it failed.
// NOTE(review): the `rule` parameter looks unused, but LEXY_VERIFY_RUNTIME
// presumably picks up the name `rule` from the enclosing scope — confirm
// against the macro definition.
auto check = [](auto rule, const char* kind, const char* name, int byte) {
auto result = LEXY_VERIFY_RUNTIME(lexy::byte_encoding{}, static_cast<unsigned char>(byte));
if (result.status == test_result::fatal_error)
{
// Mismatch: the rule fails with expected_char_class at position 0.
CHECK(result.trace == test_trace().expected_char_class(0, name).cancel());
return false;
}
else
{
// Match: a single token node whose spelling is the escaped byte, e.g. "\2A".
// Buffer holds '\' + 2 hex digits + NUL; 10 bytes is comfortably enough.
char spelling[10];
std::sprintf(spelling, "\\%02X", byte);
CHECK(result.trace == test_trace().token(kind, spelling));
return true;
}
};

// Plain dsl::byte: every value in [0x00, 0xFF] matches; error name "byte".
SUBCASE("any")
{
constexpr auto rule = dsl::byte;
CHECK(lexy::is_token_rule<decltype(rule)>);
CHECK(equivalent_rules(rule, dsl::bytes<1>));

// EOF: fails with expected_char_class("byte") at position 0.
auto empty = LEXY_VERIFY(lexy::byte_encoding{});
CHECK(empty.status == test_result::fatal_error);
CHECK(empty.trace == test_trace().expected_char_class(0, "byte").cancel());

// All 256 byte values must match.
for (auto i = 0; i < 256; ++i)
CHECK(check(rule, "any", "byte", i));
}
// byte.range<Low, High>: inclusive bounds; error name "byte.range".
SUBCASE("range")
{
constexpr auto rule = dsl::byte.range<0x00, 0x10>();
CHECK(lexy::is_token_rule<decltype(rule)>);
CHECK(equivalent_rules(rule, dsl::bytes<1>.range<0x00, 0x10>()));

// EOF: fails with the range's error name at position 0.
auto empty = LEXY_VERIFY(lexy::byte_encoding{});
CHECK(empty.status == test_result::fatal_error);
CHECK(empty.trace == test_trace().expected_char_class(0, "byte.range").cancel());

// In range (both ends inclusive) matches; everything above fails.
for (auto i = 0x00; i <= 0x10; ++i)
CHECK(check(rule, "token", "byte.range", i));
for (auto i = 0x11; i < 256; ++i)
CHECK(!check(rule, "token", "byte.range", i));
}
// byte.set<...>: only the listed bytes match; error name "byte.set".
SUBCASE("set")
{
constexpr auto rule = dsl::byte.set<0x0, 0x1, 0x2, 0x3>();
CHECK(lexy::is_token_rule<decltype(rule)>);
CHECK(equivalent_rules(rule, dsl::bytes<1>.set<0x0, 0x1, 0x2, 0x3>()));

// EOF: fails with the set's error name at position 0.
auto empty = LEXY_VERIFY(lexy::byte_encoding{});
CHECK(empty.status == test_result::fatal_error);
CHECK(empty.trace == test_trace().expected_char_class(0, "byte.set").cancel());

// Listed bytes match; all others fail.
for (auto i = 0x0; i <= 0x3; ++i)
CHECK(check(rule, "token", "byte.set", i));
for (auto i = 0x4; i < 256; ++i)
CHECK(!check(rule, "token", "byte.set", i));
}
// byte.ascii(): matches [0x00, 0x7F]; error name "byte.ASCII".
SUBCASE("ascii")
{
constexpr auto rule = dsl::byte.ascii();
CHECK(lexy::is_token_rule<decltype(rule)>);
CHECK(equivalent_rules(rule, dsl::bytes<1>.ascii()));

// EOF: fails with the ASCII error name at position 0.
auto empty = LEXY_VERIFY(lexy::byte_encoding{});
CHECK(empty.status == test_result::fatal_error);
CHECK(empty.trace == test_trace().expected_char_class(0, "byte.ASCII").cancel());

// ASCII range matches; bytes with the high bit set fail.
for (auto i = 0x00; i <= 0x7F; ++i)
CHECK(check(rule, "token", "byte.ASCII", i));
for (auto i = 0x80; i < 256; ++i)
CHECK(!check(rule, "token", "byte.ASCII", i));
}
}

TEST_CASE("dsl::bytes")
Expand Down Expand Up @@ -308,4 +381,3 @@ TEST_CASE("dsl::bint")
}
}
}

0 comments on commit bb36367

Please sign in to comment.