Skip to content

Commit

Permalink
Add lexy::dsl::byte.if_/set/range/ascii methods to match specific bytes
Browse files Browse the repository at this point in the history

Fixes #168.
  • Loading branch information
foonathan committed Sep 26, 2023
1 parent 0c2f487 commit bb36367
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 27 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

* Change `lexy::dsl::try_()` error recovery behavior:
It will now skip whitespace after the (optional) error recovery rule.
* Add `lexy::dsl::byte.if_`/`set`/`range`/`ascii` methods to match specific bytes.
* Add an overload of `fatal_error()` on scanners that allow construction of type-erased generic errors (#134).
* Add `lexy::buffer::release()` and `lexy::buffer::adopt()`.
* Add default argument to `lexy::dsl::flag()`.
Expand Down
39 changes: 32 additions & 7 deletions docs/content/reference/dsl/byte.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -27,26 +27,51 @@ Rules that match one or more bytes.
----
namespace lexy::dsl
{
class _bytes-dsl_ // models _token-rule_
{
public:
template <typename Predicate>
constexpr _token-rule_ auto if_() const;
template <unsigned char ... Bytes>
constexpr _token-rule_ auto set() const;
template <unsigned char Low, unsigned char High>
constexpr _token-rule_ auto range() const;
constexpr _token-rule_ auto ascii() const;
};
template <std::size_t N>
constexpr _token-rule_ auto bytes;
constexpr _bytes-dsl_ auto bytes;
constexpr _token-rule_ auto byte = bytes<1>;
constexpr _bytes-dsl_ auto byte = bytes<1>;
}
----

[.lead]
`bytes` is a {{% token-rule %}} that matches `N` arbitrary bytes.
`bytes` is a {{% token-rule %}} that matches `N` bytes from a specified set.

Requires::
The input {{% encoding %}} is `lexy::byte_encoding`.
Matching::
Matches and consumes `N` bytes.
Matches and consumes `N` bytes:
* By default, it matches arbitrary bytes in the range `[0x00, 0xFF]` (both sides inclusive).
Its name is `byte`.
* `.if_()`: matches bytes where the predicate returns true.
It must have a `constexpr bool operator()(unsigned char)`.
Its name is the type name of `Predicate`.
* `.set()`: matches the specified bytes.
Its name is `byte.set`.
* `.range()`: matches the bytes in the range `[Low, High]` (both sides inclusive).
Its name is `byte.range`.
* `.ascii()`: matches bytes that are also ASCII characters; i.e. in the range `[0x00, 0x7F]` (both sides inclusive).
Its name is `byte.ASCII`.
Errors::
{{% docref "lexy::expected_char_class" %}} (`"byte"`):
if EOF was encountered early; at the position of the last byte it could consume.
{{% docref "lexy::expected_char_class" %}} (with the name as above):
if a mismatched byte or EOF was encountered early; at the position of the last byte it could consume.
The rule then fails.
Parse tree::
Single token node with the {{% docref "lexy::predefined_token_kind" %}} `lexy::any_token_kind`.
Single token node with the {{% docref "lexy::predefined_token_kind" %}} `lexy::any_token_kind` (default) or `lexy::unknown_token_kind` (with predicate/range).

CAUTION: Combining {{% docref "lexy::dsl::capture" %}} with `bytes` does not do any endianness conversion.

Expand Down
116 changes: 100 additions & 16 deletions include/lexy/dsl/byte.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,36 @@
#include <cstdint>
#include <lexy/_detail/integer_sequence.hpp>
#include <lexy/dsl/base.hpp>
#include <lexy/dsl/char_class.hpp>
#include <lexy/dsl/token.hpp>

//=== byte ===//
namespace lexyd
{
template <std::size_t N>
struct _b : token_base<_b<N>>
template <std::size_t N, typename Predicate>
struct _b : token_base<_b<N, Predicate>>
{
static_assert(N > 0);

// Decides whether a single peeked byte value is acceptable.
// EOF never matches. Without a Predicate every byte matches; with one,
// the byte is forwarded to the (stateless, constexpr) predicate.
static constexpr bool _match(lexy::byte_encoding::int_type cur)
{
if (cur == lexy::byte_encoding::eof())
return false;

if constexpr (std::is_void_v<Predicate>)
{
// No predicate: any byte in [0x00, 0xFF] is fine.
return true;
}
else
{
// Delegate the decision to the predicate's call operator.
return Predicate{}(static_cast<lexy::byte_encoding::char_type>(cur));
}
}

template <typename Reader, typename Indices = lexy::_detail::make_index_sequence<N>>
struct tp;

template <typename Reader, std::size_t... Idx>
struct tp<Reader, lexy::_detail::index_sequence<Idx...>>
{
Expand All @@ -31,34 +49,99 @@ struct _b : token_base<_b<N>>
static_assert(std::is_same_v<typename Reader::encoding, lexy::byte_encoding>);

// Bump N times.
auto result = ((reader.peek() == Reader::encoding::eof() ? ((void)Idx, false)
: (reader.bump(), true))
&& ...);
end = reader.position();
auto result
= ((_match(reader.peek()) ? (reader.bump(), true) : ((void)Idx, false)) && ...);
end = reader.position();
return result;
}

template <typename Context>
constexpr void report_error(Context& context, const Reader&)
{
auto err = lexy::error<Reader, lexy::expected_char_class>(end, "byte");
constexpr auto name
= std::is_void_v<Predicate> ? "byte" : lexy::_detail::type_name<Predicate>();
auto err = lexy::error<Reader, lexy::expected_char_class>(end, name);
context.on(_ev::error{}, err);
}
};

//=== dsl ===//
/// Attaches a byte predicate, producing a token rule that only matches
/// bytes for which `Pred{}(byte)` is true.
template <typename Pred>
constexpr auto if_() const
{
// A predicate may only be supplied once; the base rule must still be
// the unconstrained one.
static_assert(std::is_void_v<Predicate>);
return _b<N, Pred>{};
}

// Matches bytes in the inclusive range [Low, High].
// NOTE(review): the local class is the Predicate type passed to if_(), so
// its identity also determines the reported error name — presumably via
// `type_name<Predicate>` / the static name(); confirm before renaming it.
template <unsigned char Low, unsigned char High>
constexpr auto range() const
{
struct predicate
{
// Error/char-class name shown in diagnostics.
static LEXY_CONSTEVAL auto name()
{
return "byte.range";
}

// Both bounds are inclusive.
constexpr bool operator()(unsigned char byte) const
{
return Low <= byte && byte <= High;
}
};

return if_<predicate>();
}

// Matches any byte that is one of the listed Bytes.
// NOTE(review): the local class is the Predicate type passed to if_(), so
// its identity also determines the reported error name; confirm before
// renaming it.
template <unsigned char... Bytes>
constexpr auto set() const
{
struct predicate
{
// Error/char-class name shown in diagnostics.
static LEXY_CONSTEVAL auto name()
{
return "byte.set";
}

// Fold over the pack: true if the byte equals any listed value.
constexpr bool operator()(unsigned char byte) const
{
return ((byte == Bytes) || ...);
}
};

return if_<predicate>();
}

// Matches bytes that are also ASCII characters, i.e. in [0x00, 0x7F].
// NOTE(review): the local class is the Predicate type passed to if_(), so
// its identity also determines the reported error name; confirm before
// renaming it.
constexpr auto ascii() const
{
struct predicate
{
// Error/char-class name shown in diagnostics.
static LEXY_CONSTEVAL auto name()
{
return "byte.ASCII";
}

// unsigned char is never negative, so a single upper-bound check suffices.
constexpr bool operator()(unsigned char byte) const
{
return byte <= 0x7F;
}
};

return if_<predicate>();
}
};

/// Matches an arbitrary byte.
constexpr auto byte = _b<1>{};
constexpr auto byte = _b<1, void>{};

/// Matches N arbitrary bytes.
template <std::size_t N>
constexpr auto bytes = _b<N>{};
constexpr auto bytes = _b<N, void>{};
} // namespace lexyd

namespace lexy
{
template <std::size_t N>
constexpr auto token_kind_of<lexy::dsl::_b<N>> = lexy::any_token_kind;
constexpr auto token_kind_of<lexy::dsl::_b<N, void>> = lexy::any_token_kind;
} // namespace lexy

//=== padding bytes ===//
Expand Down Expand Up @@ -88,9 +171,9 @@ struct _pb : branch_base

constexpr auto try_parse(const void*, const Reader& reader)
{
lexy::token_parser_for<_b<N>, Reader> parser(reader);
auto result = parser.try_parse(reader);
end = parser.end;
lexy::token_parser_for<_b<N, void>, Reader> parser(reader);
auto result = parser.try_parse(reader);
end = parser.end;
return result;
}

Expand Down Expand Up @@ -118,7 +201,7 @@ struct _pb : branch_base
LEXY_PARSER_FUNC static bool parse(Context& context, Reader& reader, Args&&... args)
{
auto begin = reader.position();
if (!_b<N>::token_parse(context, reader))
if (!_b<N, void>::token_parse(context, reader))
return false;
auto end = reader.position();

Expand Down Expand Up @@ -167,6 +250,7 @@ auto _bint()
return 0;
}
}

template <std::size_t N>
using bint = decltype(_bint<N>());
} // namespace lexy::_detail
Expand All @@ -187,10 +271,11 @@ namespace lexyd
template <std::size_t N, int Endianness, typename Rule = void>
struct _bint : branch_base
{
using _rule = lexy::_detail::type_or<Rule, _b<N>>;
using _rule = lexy::_detail::type_or<Rule, _b<N, void>>;

template <typename NextParser, typename Indices = lexy::_detail::make_index_sequence<N>>
struct _pc;

template <typename NextParser, std::size_t... Idx>
struct _pc<NextParser, lexy::_detail::index_sequence<Idx...>>
{
Expand Down Expand Up @@ -308,4 +393,3 @@ inline constexpr auto big_bint64 = _bint<8, lexy::_detail::bint_big>{};
} // namespace lexyd

#endif // LEXY_DSL_BYTE_HPP_INCLUDED

80 changes: 76 additions & 4 deletions tests/lexy/dsl/byte.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,82 @@

TEST_CASE("dsl::byte")
{
constexpr auto rule = dsl::byte;
CHECK(lexy::is_token_rule<decltype(rule)>);
CHECK(equivalent_rules(rule, dsl::bytes<1>));
static constexpr auto callback = token_callback;

// Feeds a single byte of input to `rule` and validates the resulting trace.
// Returns true if the rule matched the byte, false if it failed.
// NOTE(review): the `rule` parameter looks unused, but LEXY_VERIFY_RUNTIME
// presumably picks up the name `rule` from the enclosing scope — confirm
// against the macro definition.
auto check = [](auto rule, const char* kind, const char* name, int byte) {
auto result = LEXY_VERIFY_RUNTIME(lexy::byte_encoding{}, static_cast<unsigned char>(byte));
if (result.status == test_result::fatal_error)
{
// Mismatch: the rule fails with expected_char_class at position 0.
CHECK(result.trace == test_trace().expected_char_class(0, name).cancel());
return false;
}
else
{
// Match: a single token node whose spelling is the escaped byte, e.g. "\2A".
// Buffer holds '\' + 2 hex digits + NUL; 10 bytes is comfortably enough.
char spelling[10];
std::sprintf(spelling, "\\%02X", byte);
CHECK(result.trace == test_trace().token(kind, spelling));
return true;
}
};

// Plain dsl::byte: every value in [0x00, 0xFF] matches; error name "byte".
SUBCASE("any")
{
constexpr auto rule = dsl::byte;
CHECK(lexy::is_token_rule<decltype(rule)>);
CHECK(equivalent_rules(rule, dsl::bytes<1>));

// EOF: fails with expected_char_class("byte") at position 0.
auto empty = LEXY_VERIFY(lexy::byte_encoding{});
CHECK(empty.status == test_result::fatal_error);
CHECK(empty.trace == test_trace().expected_char_class(0, "byte").cancel());

// All 256 byte values must match.
for (auto i = 0; i < 256; ++i)
CHECK(check(rule, "any", "byte", i));
}
// byte.range<Low, High>: inclusive bounds; error name "byte.range".
SUBCASE("range")
{
constexpr auto rule = dsl::byte.range<0x00, 0x10>();
CHECK(lexy::is_token_rule<decltype(rule)>);
CHECK(equivalent_rules(rule, dsl::bytes<1>.range<0x00, 0x10>()));

// EOF: fails with the range's error name at position 0.
auto empty = LEXY_VERIFY(lexy::byte_encoding{});
CHECK(empty.status == test_result::fatal_error);
CHECK(empty.trace == test_trace().expected_char_class(0, "byte.range").cancel());

// In range (both ends inclusive) matches; everything above fails.
for (auto i = 0x00; i <= 0x10; ++i)
CHECK(check(rule, "token", "byte.range", i));
for (auto i = 0x11; i < 256; ++i)
CHECK(!check(rule, "token", "byte.range", i));
}
// byte.set<...>: only the listed bytes match; error name "byte.set".
SUBCASE("set")
{
constexpr auto rule = dsl::byte.set<0x0, 0x1, 0x2, 0x3>();
CHECK(lexy::is_token_rule<decltype(rule)>);
CHECK(equivalent_rules(rule, dsl::bytes<1>.set<0x0, 0x1, 0x2, 0x3>()));

// EOF: fails with the set's error name at position 0.
auto empty = LEXY_VERIFY(lexy::byte_encoding{});
CHECK(empty.status == test_result::fatal_error);
CHECK(empty.trace == test_trace().expected_char_class(0, "byte.set").cancel());

// Listed bytes match; all others fail.
for (auto i = 0x0; i <= 0x3; ++i)
CHECK(check(rule, "token", "byte.set", i));
for (auto i = 0x4; i < 256; ++i)
CHECK(!check(rule, "token", "byte.set", i));
}
// byte.ascii(): matches [0x00, 0x7F]; error name "byte.ASCII".
SUBCASE("ascii")
{
constexpr auto rule = dsl::byte.ascii();
CHECK(lexy::is_token_rule<decltype(rule)>);
CHECK(equivalent_rules(rule, dsl::bytes<1>.ascii()));

// EOF: fails with the ASCII error name at position 0.
auto empty = LEXY_VERIFY(lexy::byte_encoding{});
CHECK(empty.status == test_result::fatal_error);
CHECK(empty.trace == test_trace().expected_char_class(0, "byte.ASCII").cancel());

// ASCII range matches; bytes with the high bit set fail.
for (auto i = 0x00; i <= 0x7F; ++i)
CHECK(check(rule, "token", "byte.ASCII", i));
for (auto i = 0x80; i < 256; ++i)
CHECK(!check(rule, "token", "byte.ASCII", i));
}
}

TEST_CASE("dsl::bytes")
Expand Down Expand Up @@ -308,4 +381,3 @@ TEST_CASE("dsl::bint")
}
}
}

0 comments on commit bb36367

Please sign in to comment.