diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml index 83df6c4a..0125cca6 100644 --- a/.github/workflows/linux.yaml +++ b/.github/workflows/linux.yaml @@ -10,7 +10,7 @@ jobs: strategy: fail-fast: false matrix: - format: ["JSON", "AVRO", "CAPNPROTO", "CBOR", "FLEXBUFFERS", "MSGPACK", "XML", "TOML", "UBJSON", "YAML", "benchmarks"] + format: ["JSON", "AVRO", "CAPNPROTO", "CBOR", "FLEXBUFFERS", "MSGPACK", "PARQUET", "TOML", "UBJSON", "XML", "YAML", "benchmarks"] compiler: [llvm, gcc] compiler-version: [11, 12, 13, 14, 16, 17, 18] cxx: [20, 23] diff --git a/.github/workflows/macos.yaml b/.github/workflows/macos.yaml index 60f3f0aa..2b34b06c 100644 --- a/.github/workflows/macos.yaml +++ b/.github/workflows/macos.yaml @@ -11,7 +11,7 @@ jobs: fail-fast: false matrix: os: ["macos-latest", "macos-13"] - format: ["JSON", "AVRO", "CAPNPROTO", "CBOR", "FLEXBUFFERS", "MSGPACK", "XML", "TOML", "UBJSON", "YAML", "benchmarks"] + format: ["JSON", "AVRO", "CAPNPROTO", "CBOR", "FLEXBUFFERS", "MSGPACK", "PARQUET", "TOML", "UBJSON", "XML", "YAML", "benchmarks"] name: "${{ matrix.os }} (${{ matrix.format }})" runs-on: ${{ matrix.os }} steps: @@ -28,6 +28,8 @@ jobs: core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || ''); - name: Run vcpkg uses: lukka/run-vcpkg@v11 + - name: Install bison + run: brew install bison - name: Install ninja run: brew install ninja if: matrix.os == 'macos-latest' diff --git a/.github/workflows/windows.yaml b/.github/workflows/windows.yaml index 1727de4e..2445f3ce 100644 --- a/.github/workflows/windows.yaml +++ b/.github/workflows/windows.yaml @@ -10,7 +10,7 @@ jobs: strategy: fail-fast: false matrix: - format: ["JSON", "CAPNPROTO", "CBOR", "FLEXBUFFERS", "MSGPACK", "XML", "TOML", "UBJSON", "YAML", "benchmarks"] + format: ["JSON", "AVRO", "CAPNPROTO", "CBOR", "FLEXBUFFERS", "MSGPACK", "PARQUET", "TOML", "UBJSON", "XML", "YAML", "benchmarks"] name: "windows-msvc (${{ matrix.format }})" runs-on: windows-latest 
steps: diff --git a/.gitignore b/.gitignore index 13abfb6a..1b782f9d 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ *.fb *.flexbuf *.msgpack +*.parquet *.toml *.ubjson *.xml diff --git a/CMakeLists.txt b/CMakeLists.txt index ac3eec0f..e811cb42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,7 @@ option(REFLECTCPP_CAPNPROTO "Enable Cap’n Proto support" ${REFLECTCPP_ALL_FORM option(REFLECTCPP_CBOR "Enable CBOR support" ${REFLECTCPP_ALL_FORMATS}) option(REFLECTCPP_FLEXBUFFERS "Enable flexbuffers support" ${REFLECTCPP_ALL_FORMATS}) option(REFLECTCPP_MSGPACK "Enable msgpack support" ${REFLECTCPP_ALL_FORMATS}) +option(REFLECTCPP_PARQUET "Enable parquet support" ${REFLECTCPP_ALL_FORMATS}) option(REFLECTCPP_XML "Enable XML support" ${REFLECTCPP_ALL_FORMATS}) option(REFLECTCPP_TOML "Enable TOML support" ${REFLECTCPP_ALL_FORMATS}) option(REFLECTCPP_UBJSON "Enable UBJSON support" ${REFLECTCPP_ALL_FORMATS}) @@ -55,7 +56,8 @@ endif() if (REFLECTCPP_BUILD_TESTS OR REFLECTCPP_BUILD_BENCHMARKS OR (REFLECTCPP_JSON AND NOT REFLECTCPP_USE_BUNDLED_DEPENDENCIES) OR REFLECTCPP_AVRO OR REFLECTCPP_BSON OR REFLECTCPP_CAPNPROTO OR REFLECTCPP_CBOR OR REFLECTCPP_FLEXBUFFERS OR - REFLECTCPP_MSGPACK OR REFLECTCPP_XML OR REFLECTCPP_TOML OR REFLECTCPP_UBJSON OR REFLECTCPP_YAML) + REFLECTCPP_MSGPACK OR REFLECTCPP_PARQUET OR REFLECTCPP_XML OR + REFLECTCPP_TOML OR REFLECTCPP_UBJSON OR REFLECTCPP_YAML) # enable vcpkg per default if features other than JSON are required set(REFLECTCPP_USE_VCPKG_DEFAULT ON) endif() @@ -109,6 +111,10 @@ if (REFLECTCPP_USE_VCPKG) list(APPEND VCPKG_MANIFEST_FEATURES "msgpack") endif() + if (REFLECTCPP_PARQUET) + list(APPEND VCPKG_MANIFEST_FEATURES "parquet") + endif() + if (REFLECTCPP_TOML) list(APPEND VCPKG_MANIFEST_FEATURES "toml") endif() @@ -260,6 +266,17 @@ if (REFLECTCPP_MSGPACK) target_link_libraries(reflectcpp PUBLIC msgpack-c) endif () +if (REFLECTCPP_PARQUET) + if (NOT TARGET Arrow) + find_package(Arrow CONFIG REQUIRED) + endif() + if 
(NOT TARGET Parquet) + find_package(Parquet CONFIG REQUIRED) + endif() + target_link_libraries(reflectcpp PUBLIC "$,Arrow::arrow_static,Arrow::arrow_shared>") + target_link_libraries(reflectcpp PUBLIC "$,Parquet::parquet_static,Parquet::parquet_shared>") +endif () + if (REFLECTCPP_TOML) list(APPEND REFLECT_CPP_SOURCES src/reflectcpp_toml.cpp diff --git a/README.md b/README.md index 68b09aeb..c32de0b7 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,6 @@ reflect-cpp and sqlgen fill important gaps in C++ development. They reduce boile
- ## Table of Contents ### On this page @@ -38,6 +37,7 @@ reflect-cpp and sqlgen fill important gaps in C++ development. They reduce boile - [Feature Overview](#feature-overview) - [Simple Example](#simple-example) - [More Comprehensive Example](#more-comprehensive-example) + - [Tabular data](#tabular-data) - [Error messages](#error-messages) - [JSON schema](#json-schema) - [Enums](#enums) @@ -70,6 +70,7 @@ The following table lists the serialization formats currently supported by refle | CBOR | [jsoncons](https://github.com/danielaparker/jsoncons)| >= 0.176.0 | BSL 1.0 | JSON-like binary format | | flexbuffers | [flatbuffers](https://github.com/google/flatbuffers) | >= 23.5.26 | Apache 2.0 | Schema-less version of flatbuffers, binary format | | msgpack | [msgpack-c](https://github.com/msgpack/msgpack-c) | >= 6.0.0 | BSL 1.0 | JSON-like binary format | +| parquet | [Apache Arrow](https://arrow.apache.org/) | >= 21.0.0 | Apache 2.0 | Tabular binary format | | TOML | [toml++](https://github.com/marzer/tomlplusplus) | >= 3.4.0 | MIT | Textual format with an emphasis on readability | | UBJSON | [jsoncons](https://github.com/danielaparker/jsoncons)| >= 0.176.0 | BSL 1.0 | JSON-like binary format | | XML | [pugixml](https://github.com/zeux/pugixml) | >= 1.14 | MIT | Textual format used in many legacy projects | @@ -145,7 +146,7 @@ age: 45 ``` This will work for just about any example in the entire documentation -and any supported format, except where explicitly noted otherwise: +and any of the following formats, except where explicitly noted otherwise: ```cpp rfl::avro::write(homer); @@ -242,6 +243,34 @@ std::cout << "Hello, my name is " << homer2.first_name() << " " << homer2.last_name() << "." 
<< std::endl; ``` +### Tabular data + +reflect-cpp also supports tabular data formats, like Parquet: + +```cpp +#include <rfl/parquet.hpp> + +const auto people = + std::vector<Person>({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + +const auto bytestring = rfl::parquet::write(people); +``` + ### Error messages reflect-cpp returns clear and comprehensive error messages: diff --git a/docs/supported_formats/parquet.md b/docs/supported_formats/parquet.md new file mode 100644 index 00000000..f259b3ce --- /dev/null +++ b/docs/supported_formats/parquet.md @@ -0,0 +1,265 @@ +For Parquet support, you must also include the header `<rfl/parquet.hpp>` and link to the [Apache Arrow](https://arrow.apache.org/) and [Apache Parquet](https://parquet.apache.org/) libraries. +Furthermore, when compiling reflect-cpp, you need to pass `-DREFLECTCPP_PARQUET=ON` to cmake. + +Parquet is a columnar storage format optimized for analytical workloads. Unlike most other formats supported by reflect-cpp, Parquet is designed for tabular data and has specific limitations regarding nested structures. + +## Reading and writing + +Suppose you have a struct like this: + +```cpp +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name; + rfl::Timestamp<"%Y-%m-%d"> birthday; + unsigned int age; + rfl::Email email; +}; +``` + +**Important**: Parquet is a tabular format that requires collections of records. You cannot serialize individual structs - you must use containers like `std::vector`, `std::deque`, etc. 
+ +A collection of `Person` structs can be serialized to a bytes vector like this: + +```cpp +const auto people = std::vector<Person>{ + Person{.first_name = "Bart", .birthday = "1987-04-19", .age = 10, .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", .birthday = "1987-04-19", .age = 8, .email = "lisa@simpson.com"} +}; +const std::vector<char> bytes = rfl::parquet::write(people); +``` + +You can parse bytes like this: + +```cpp +const rfl::Result<std::vector<Person>> result = rfl::parquet::read<std::vector<Person>>(bytes); +``` + +## Settings and compression + +Parquet supports various compression algorithms and chunk sizes. You can configure these using the `Settings` struct: + +```cpp +const auto settings = rfl::parquet::Settings{} + .with_compression(rfl::parquet::Compression::GZIP) + .with_chunksize(1000); + +const std::vector<char> bytes = rfl::parquet::write(people, settings); +``` + +Available compression options include: + +- `UNCOMPRESSED` - No compression, fastest read/write but largest file size +- `SNAPPY` (default) - Fast compression/decompression, good balance of speed and size +- `GZIP` - Good compression ratio, slower than Snappy but better compression +- `BROTLI` - Good compression for text data, optimized for web content +- `ZSTD` - Excellent compression ratio, modern algorithm with good speed +- `LZ4` - Very fast compression/decompression, lower compression ratio +- `LZ4_FRAME` - LZ4 with frame format, better compatibility +- `LZO` - Fast compression, older algorithm +- `BZ2` - High compression ratio, slower compression/decompression +- `LZ4_HADOOP` - LZ4 optimized for Hadoop ecosystem + +```cpp +// Examples of different compression settings +const auto snappy_settings = rfl::parquet::Settings{} + .with_compression(rfl::parquet::Compression::SNAPPY); + +const auto gzip_settings = rfl::parquet::Settings{} + .with_compression(rfl::parquet::Compression::GZIP); + +const auto zstd_settings = rfl::parquet::Settings{} + .with_compression(rfl::parquet::Compression::ZSTD); + +const auto 
uncompressed_settings = rfl::parquet::Settings{} + .with_compression(rfl::parquet::Compression::UNCOMPRESSED); +``` + +## Loading and saving + +You can also load and save to disk using a very similar syntax: + +```cpp +const rfl::Result<std::vector<Person>> result = rfl::parquet::load<std::vector<Person>>("/path/to/file.parquet"); + +const auto people = std::vector<Person>{...}; +rfl::parquet::save("/path/to/file.parquet", people); +``` + +With custom settings: + +```cpp +const auto settings = rfl::parquet::Settings{} + .with_compression(rfl::parquet::Compression::GZIP); +rfl::parquet::save("/path/to/file.parquet", people, settings); +``` + +## Reading from and writing into streams + +You can also read from and write into any `std::istream` and `std::ostream` respectively. + +```cpp +const rfl::Result<std::vector<Person>> result = rfl::parquet::read<std::vector<Person>>(my_istream); + +const auto people = std::vector<Person>{...}; +rfl::parquet::write(people, my_ostream); +``` + +With custom settings: + +```cpp +const auto settings = rfl::parquet::Settings{} + .with_compression(rfl::parquet::Compression::GZIP); +rfl::parquet::write(people, my_ostream, settings); +``` + +## Field name transformations + +Like other formats, Parquet supports field name transformations. You can use processors like `SnakeCaseToCamelCase`: + +```cpp +const auto people = std::vector<Person>{...}; +const auto result = rfl::parquet::read<std::vector<Person>, rfl::SnakeCaseToCamelCase>(bytes); +``` + +This will automatically convert field names from snake_case to camelCase during serialization and deserialization. 
+ +## Supported processors + +Processors that only transform field names, such as `rfl::SnakeCaseToCamelCase`, can be used as shown above. The following processors are **NOT supported** and will cause compilation errors: + +- `rfl::AddTagsToVariants` - Cannot be used for tabular data +- `rfl::NoOptionals` - Cannot be used for tabular data +- `rfl::DefaultIfMissing` - Cannot be used for tabular data +- `rfl::NoExtraFields` - Cannot be used for tabular data +- `rfl::NoFieldNames` - Cannot be used for tabular data + +```cpp +// ✅ This works +const auto result = rfl::parquet::read<std::vector<Person>, rfl::SnakeCaseToCamelCase>(bytes); + +// ❌ This will cause compilation errors +const auto result = rfl::parquet::read<std::vector<Person>, rfl::AddTagsToVariants>(bytes); +const auto result = rfl::parquet::read<std::vector<Person>, rfl::NoOptionals>(bytes); +const auto result = rfl::parquet::read<std::vector<Person>, rfl::DefaultIfMissing>(bytes); +``` + +## Enums and validation + +Parquet supports enums and validated types. Enums are stored as strings: + +```cpp +enum class FirstName { Bart, Lisa, Maggie, Homer }; + +struct Person { + rfl::Rename<"firstName", FirstName> first_name; + rfl::Rename<"lastName", std::string> last_name; + rfl::Timestamp<"%Y-%m-%d"> birthday; + rfl::Validator<unsigned int, rfl::Minimum<0>, rfl::Maximum<130>> age; + rfl::Email email; +}; +``` + +## No variant types + +Parquet does not support variant types like `std::variant`, `rfl::Variant`, or `rfl::TaggedUnion`. These types cannot be serialized to Parquet format. + +```cpp +// ❌ This will NOT work +struct Person { + std::string first_name; + std::variant status; // Variant - not supported + rfl::Variant type; // rfl::Variant - not supported + rfl::TaggedUnion<"type", std::string, int> category; // TaggedUnion - not supported +}; +``` + +## Limitations of tabular formats + +Parquet, like other tabular formats, has specific limitations that differ from hierarchical formats like JSON or XML: + +### No nested objects +Unlike JSON or XML, Parquet cannot directly represent nested objects within a single row. Each field must be a flat value that maps to a single column, such as a primitive type, a string, an enum or a bytestring; nested structs must be flattened explicitly (see below). 
+ +```cpp +// This works fine +struct Person { + std::string first_name; + std::string last_name; + unsigned int age; +}; + +// This would NOT work as expected - nested objects are not automatically flattened +struct Address { + std::string street; + std::string city; +}; + +struct Person { + std::string first_name; + std::string last_name; + Address address; // ❌ This will cause compilation errors +}; +``` + +### Using rfl::Flatten for nested objects + +If you need to include nested objects, you can use `rfl::Flatten` to explicitly flatten them: + +```cpp +struct Address { + std::string street; + std::string city; +}; + +struct Person { + std::string first_name; + std::string last_name; + rfl::Flatten<Address>
address; // ✅ This will flatten the Address fields +}; + +// The resulting Parquet file will have columns: first_name, last_name, street, city +``` + +### Collections requirement + +You must serialize collections, not individual objects: +```cpp +std::vector<Person> people = {...}; // ✅ Correct +Person person = {...}; // ❌ Wrong - must be in a container +``` + +### No arrays (except bytestrings) +Parquet does not support arrays of any type except for binary data (bytestrings). This includes arrays of primitive types, strings, and objects. + +```cpp +// ❌ This will NOT work +struct Person { + std::string first_name; + std::vector<std::string> hobbies; // Array of strings - not supported + std::vector<int> scores; // Array of integers - not supported + std::vector<Address>
addresses; // Array of objects - not supported +}; + +// ✅ This works +struct Person { + std::string first_name; + std::string last_name; + std::vector binary_data; // Binary data - supported as bytestring +}; +``` +### Use cases +Parquet is ideal for: +- Data warehousing and analytics +- Large datasets with repeated values +- Integration with big data tools (Spark, Hadoop, etc.) +- Simple, flat data structures with consistent types + +Parquet is less suitable for: +- Complex nested data structures +- Data with arrays or variant types +- Frequent schema changes +- Row-oriented access patterns +- Small datasets where the overhead isn't justified +- Data with complex object hierarchies + diff --git a/include/rfl.hpp b/include/rfl.hpp index ac5e7eac..4753749c 100644 --- a/include/rfl.hpp +++ b/include/rfl.hpp @@ -76,6 +76,7 @@ #include "rfl/to_view.hpp" #include "rfl/tuple_cat.hpp" #include "rfl/type_name_t.hpp" +#include "rfl/view_t.hpp" #include "rfl/visit.hpp" #ifdef _MSC_VER diff --git a/include/rfl/Timestamp.hpp b/include/rfl/Timestamp.hpp index d5302683..9909c0f0 100644 --- a/include/rfl/Timestamp.hpp +++ b/include/rfl/Timestamp.hpp @@ -37,6 +37,15 @@ class Timestamp { Timestamp(const std::tm& _tm) : tm_(_tm) {} + Timestamp(const time_t _t) : tm_(std::tm{}) { + auto t = _t; +#if defined(_MSC_VER) || defined(__MINGW32__) + gmtime_s(&tm_, &t); +#else + gmtime_r(&t, &tm_); +#endif + } + ~Timestamp() = default; /// Returns a result containing the timestamp when successful or an Error @@ -71,6 +80,16 @@ class Timestamp { /// Trivial (const) accessor to the underlying time stamp. const std::tm& tm() const { return tm_; } + /// Returns a UTC time represented by a time_t type. 
+ time_t to_time_t() const { + auto tm = tm_; // copy: this member function is const and &tm is handed to the C API below +#if defined(_MSC_VER) || defined(__MINGW32__) + return _mkgmtime(&tm); +#else + return static_cast(timegm(&tm) - tm_.tm_gmtoff); // timegm() result minus the tm_gmtoff stored in tm_ +#endif + } + private: #if defined(_MSC_VER) || defined(__MINGW32__) // This workaround is necessary, because strptime is not available on Windows. diff --git a/include/rfl/internal/has_reflection_method_v.hpp b/include/rfl/internal/has_reflection_method_v.hpp index 228239a5..963253b1 100644 --- a/include/rfl/internal/has_reflection_method_v.hpp +++ b/include/rfl/internal/has_reflection_method_v.hpp @@ -1,29 +1,15 @@ #ifndef RFL_INTERNAL_HASREFLECTIONMETHODV_HPP_ #define RFL_INTERNAL_HASREFLECTIONMETHODV_HPP_ -#include +#include -namespace rfl { -namespace internal { +namespace rfl::internal { -template -using reflection_method_t = - decltype(std::declval().reflection()); +template +constexpr bool has_reflection_method_v = requires(T t) { // true if the compound requirement below holds for T + { t.reflection() } -> std::convertible_to; +}; -template > -struct has_refl_m : std::false_type {}; - -template -struct has_refl_m>> - : std::true_type {}; - -/// Utility parameter for named tuple parsing, can be used by the -/// parsers to determine whether a class or struct has a method -/// called "reflection". 
-template -constexpr bool has_reflection_method_v = has_refl_m::value; - -} // namespace internal -} // namespace rfl +} // namespace rfl::internal #endif diff --git a/include/rfl/internal/has_reflection_type_v.hpp b/include/rfl/internal/has_reflection_type_v.hpp index f6cd70a9..c6020138 100644 --- a/include/rfl/internal/has_reflection_type_v.hpp +++ b/include/rfl/internal/has_reflection_type_v.hpp @@ -1,33 +1,20 @@ #ifndef RFL_HASREFLECTIONTYPEV_HPP_ #define RFL_HASREFLECTIONTYPEV_HPP_ -#include -#include +#include -namespace rfl { -namespace internal { +namespace rfl::internal { -template -class HasReflectionType { - private: - template - static std::int64_t foo(...); +template +struct ReflectionTypeWrapper {}; // empty helper, only named inside the requires-expression below - template - static std::int32_t foo(typename U::ReflectionType*); - - public: - static constexpr bool value = - sizeof(foo(nullptr)) == sizeof(std::int32_t); +template +constexpr bool has_reflection_type_v = requires() { // true if the compound requirement below is well-formed + { + ReflectionTypeWrapper{} + } -> std::same_as>; }; -/// Utility parameter for named tuple parsing, can be used by the -/// parsers to determine whether a class or struct defines a type -/// called "ReflectionType". 
-template -constexpr bool has_reflection_type_v = HasReflectionType::value; - -} // namespace internal -} // namespace rfl +} // namespace rfl::internal #endif // RFL_HASNAMEDTUPLETYPEV_HPP_ diff --git a/include/rfl/internal/ptr_named_tuple_t.hpp b/include/rfl/internal/ptr_named_tuple_t.hpp index c354e9be..7b7544b5 100644 --- a/include/rfl/internal/ptr_named_tuple_t.hpp +++ b/include/rfl/internal/ptr_named_tuple_t.hpp @@ -13,7 +13,7 @@ namespace internal { template using ptr_named_tuple_t = - typename std::invoke_result), T>::type; + std::invoke_result_t), T>; // alias form of the removed std::invoke_result<...>::type } // namespace internal } // namespace rfl diff --git a/include/rfl/named_tuple_t.hpp b/include/rfl/named_tuple_t.hpp index 4cb7786a..8e31100b 100644 --- a/include/rfl/named_tuple_t.hpp +++ b/include/rfl/named_tuple_t.hpp @@ -6,18 +6,19 @@ #include #include "NamedTuple.hpp" -#include "internal/ptr_named_tuple_t.hpp" +#include "Processors.hpp" #include "internal/remove_ptrs_nt.hpp" #include "to_named_tuple.hpp" +#include "view_t.hpp" namespace rfl { /// Generates the named tuple that is equivalent to the struct T. /// This is the result you would expect from calling to_named_tuple(my_struct). /// All fields of the struct must be an rfl::Field. 
-template -using named_tuple_t = typename internal::remove_ptrs_nt< - internal::ptr_named_tuple_t>::NamedTupleType; +template +using named_tuple_t = + typename internal::remove_ptrs_nt>::NamedTupleType; } // namespace rfl diff --git a/include/rfl/parquet.hpp b/include/rfl/parquet.hpp new file mode 100644 index 00000000..6d7b847a --- /dev/null +++ b/include/rfl/parquet.hpp @@ -0,0 +1,10 @@ +#ifndef RFL_PARQUET_HPP_ +#define RFL_PARQUET_HPP_ + +#include "../rfl.hpp" +#include "parquet/load.hpp" +#include "parquet/read.hpp" +#include "parquet/save.hpp" +#include "parquet/write.hpp" // write.hpp itself includes Settings.hpp + +#endif diff --git a/include/rfl/parquet/Settings.hpp b/include/rfl/parquet/Settings.hpp new file mode 100644 index 00000000..2ba65def --- /dev/null +++ b/include/rfl/parquet/Settings.hpp @@ -0,0 +1,32 @@ +#ifndef RFL_PARQUET_SETTINGS_HPP_ +#define RFL_PARQUET_SETTINGS_HPP_ + +#include +#include + +#include "../Field.hpp" +#include "../replace.hpp" + +namespace rfl::parquet { + +using Compression = arrow::Compression::type; // the Arrow enum is re-exported as rfl::parquet::Compression + +struct Settings { + /// The size of the chunks of the parquet file. + size_t chunksize = 2000; + + /// The compression algorithm used to compress the parquet file. 
+ Compression compression = Compression::SNAPPY; + + Settings with_chunksize(const size_t _chunksize) const noexcept { // returns a new Settings; *this is not modified (const member) + return replace(*this, make_field<"chunksize">(_chunksize)); + } + + Settings with_compression(const Compression _compression) const noexcept { // returns a new Settings; *this is not modified (const member) + return replace(*this, make_field<"compression">(_compression)); + } +}; + +} // namespace rfl::parquet + +#endif diff --git a/include/rfl/parquet/load.hpp b/include/rfl/parquet/load.hpp new file mode 100644 index 00000000..e8802ab5 --- /dev/null +++ b/include/rfl/parquet/load.hpp @@ -0,0 +1,20 @@ +#ifndef RFL_PARQUET_LOAD_HPP_ +#define RFL_PARQUET_LOAD_HPP_ + +#include "../Result.hpp" +#include "../io/load_bytes.hpp" +#include "read.hpp" + +namespace rfl::parquet { + +template +Result load(const std::string& _fname) { + const auto read_bytes = [](const auto& _bytes) { + return read(_bytes); + }; + return rfl::io::load_bytes(_fname).and_then(read_bytes); // load the file's bytes first, then parse them with read() +} + +} // namespace rfl::parquet + +#endif diff --git a/include/rfl/parquet/read.hpp b/include/rfl/parquet/read.hpp new file mode 100644 index 00000000..68ff2e76 --- /dev/null +++ b/include/rfl/parquet/read.hpp @@ -0,0 +1,67 @@ +#ifndef RFL_PARQUET_READ_HPP_ +#define RFL_PARQUET_READ_HPP_ + +#include +#include + +#include +#include +#include + +#include "../Processors.hpp" +#include "../Result.hpp" +#include "../concepts.hpp" +#include "../internal/wrap_in_rfl_array_t.hpp" +#include "../parsing/tabular/ArrowReader.hpp" + +namespace rfl::parquet { + +/// Parses an object from PARQUET using reflection. 
+template +Result> read( + const concepts::ByteLike auto* _bytes, const size_t _size) { + arrow::MemoryPool* pool = arrow::default_memory_pool(); + + const auto buffer = std::make_shared( + internal::ptr_cast(_bytes), _size); + + const auto input = std::make_shared(buffer); + + auto arrow_reader = ::parquet::arrow::OpenFile(input, pool); // result object; checked with ok() right below + + if (!arrow_reader.ok()) { + return error(std::string("Could not generate the arrow reader: ") + + arrow_reader.status().message()); + } + + std::shared_ptr table; + + const auto status = arrow_reader.ValueOrDie()->ReadTable(&table); // table is passed as an out-parameter + + if (!status.ok()) { + return error("Could not read table: " + status.message()); + } + + using ArrowReader = parsing::tabular::ArrowReader; + + return ArrowReader::make(table).and_then( + [](const auto& _r) { return _r.read(); }); // read() only runs if make() succeeded +} + +/// Parses an object from PARQUET using reflection. +template +auto read(const concepts::ContiguousByteContainer auto& _bytes) { + return read(_bytes.data(), _bytes.size()); // forwards to the pointer/size overload above +} + +/// Parses an object from a stream. 
+template +auto read(std::istream& _stream) { + std::istreambuf_iterator begin(_stream), end; + auto bytes = std::vector(begin, end); // buffers the entire stream in memory before parsing + return read(bytes.data(), bytes.size()); +} + +} // namespace rfl::parquet + +#endif diff --git a/include/rfl/parquet/save.hpp b/include/rfl/parquet/save.hpp new file mode 100644 index 00000000..092abe64 --- /dev/null +++ b/include/rfl/parquet/save.hpp @@ -0,0 +1,28 @@ +#ifndef RFL_PARQUET_SAVE_HPP_ +#define RFL_PARQUET_SAVE_HPP_ + +#include +#include +#include + +#include "../Result.hpp" +#include "../io/save_bytes.hpp" +#include "write.hpp" + +namespace rfl::parquet { + +/// Saves _obj to the file _fname in parquet format. +/// _settings is forwarded to write(); the default argument keeps existing +/// two-argument calls working. +template +Result save(const std::string& _fname, const auto& _obj, + const Settings& _settings = Settings{}) { + const auto write_func = [&](const auto& _obj, auto& _stream) -> auto& { + return write(_obj, _stream, _settings); + }; + return rfl::io::save_bytes(_fname, _obj, write_func); +} + +} // namespace rfl::parquet + +#endif diff --git a/include/rfl/parquet/write.hpp b/include/rfl/parquet/write.hpp new file mode 100644 index 00000000..0dd397af --- /dev/null +++ b/include/rfl/parquet/write.hpp @@ -0,0 +1,81 @@ +#ifndef RFL_PARQUET_WRITE_HPP_ +#define RFL_PARQUET_WRITE_HPP_ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../Processors.hpp" +#include "../Ref.hpp" +#include "../parsing/tabular/ArrowWriter.hpp" +#include "Settings.hpp" + +namespace rfl::parquet { + +/// Returns parquet bytes. 
+template +Ref to_buffer(const auto& _arr, const Settings& _settings) { + using T = std::remove_cvref_t; + + const auto table = + parsing::tabular::ArrowWriter(_settings.chunksize) + .to_table(_arr); + + const auto props = ::parquet::WriterProperties::Builder() + .compression(_settings.compression) + ->build(); + + const auto arrow_props = + ::parquet::ArrowWriterProperties::Builder().store_schema()->build(); + + const auto output_buffer = arrow::io::BufferOutputStream::Create(); + + if (!output_buffer.ok()) { + throw std::runtime_error(output_buffer.status().message()); + } + + const auto status = ::parquet::arrow::WriteTable( + *table.get(), arrow::default_memory_pool(), output_buffer.ValueOrDie(), + _settings.chunksize, props, arrow_props); // _settings.chunksize is used here as well as for the ArrowWriter above + + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + + const auto buffer = output_buffer.ValueOrDie()->Finish(); + + if (!buffer.ok()) { + throw std::runtime_error(buffer.status().message()); // report the Finish() failure; output_buffer was already verified ok above + } + + return Ref::make(buffer.ValueOrDie()).value(); +} + +/// Returns parquet bytes. +template +std::vector write(const auto& _arr, + const Settings& _settings = Settings{}) { + const auto buffer = to_buffer(_arr, _settings); + const auto view = std::string_view(*buffer); + return std::vector(view.begin(), view.end()); // copies the buffer contents into the returned vector +} + +/// Writes a PARQUET into an ostream. 
+template +std::ostream& write(const auto& _arr, std::ostream& _stream, + const Settings& _settings = Settings{}) { // not noexcept: to_buffer() throws std::runtime_error on failure + auto buffer = to_buffer(_arr, _settings); + _stream << std::string_view(*buffer); + return _stream; +} + +} // namespace rfl::parquet + +#endif diff --git a/include/rfl/parsing/tabular/ArrowReader.hpp b/include/rfl/parsing/tabular/ArrowReader.hpp new file mode 100644 index 00000000..153778b7 --- /dev/null +++ b/include/rfl/parsing/tabular/ArrowReader.hpp @@ -0,0 +1,128 @@ +#ifndef RFL_PARSING_TABULAR_ARROWREADER_HPP_ +#define RFL_PARSING_TABULAR_ARROWREADER_HPP_ + +#include + +#include +#include +#include +#include +#include +#include + +#include "../../Processors.hpp" +#include "../../Result.hpp" +#include "../../Tuple.hpp" +#include "../../apply.hpp" +#include "../../get.hpp" +#include "../../named_tuple_t.hpp" +#include "../../to_view.hpp" +#include "../../view_t.hpp" +#include "../call_destructors_where_necessary.hpp" +#include "make_chunked_array_iterators.hpp" + +namespace rfl::parsing::tabular { + +template +class ArrowReader { + static_assert(!Processors::add_tags_to_variants_, + "rfl::AddTagsToVariants cannot be used for tabular data."); + static_assert(!Processors::all_required_, + "rfl::NoOptionals cannot be used for tabular data."); + static_assert(!Processors::default_if_missing_, + "rfl::DefaultIfMissing cannot be used for tabular data."); + static_assert(!Processors::no_extra_fields_, + "rfl::NoExtraFields cannot be used for tabular data (neither " + "can rfl::ExtraFields)."); + static_assert(!Processors::no_field_names_, + "rfl::NoFieldNames cannot be used for tabular data."); + + public: + using ValueType = typename std::remove_cvref_t; + + static Result make(const std::shared_ptr& _table) { // factory: turns a constructor exception into an error result + try { + return ArrowReader(_table); + } catch (const std::exception& e) { + return error(std::string("Could not create ArrowReader: ") + e.what()); + } + } + + ~ArrowReader() = default; + + Result read() const noexcept { + return 
make_chunked_array_iterators>(table_) + .and_then([&](auto chunked_array_iterators) -> Result { + VecType result; + while (!end(chunked_array_iterators)) { // one value per iteration, until any column iterator reports end() + auto value = new_value(&chunked_array_iterators); + if (!value) { + return error(value.error().what()); + } + result.emplace_back(std::move(*value)); + } + return result; + }); + } + + private: + ArrowReader(const std::shared_ptr& _table) + : table_(Ref::make(_table).value()) {} + + bool end(const auto& _chunked_array_iterators) const { + return apply( + [](const auto&... _its) { return (false || ... || _its.end()); }, // true as soon as any iterator is at its end + _chunked_array_iterators); + } + + Result new_value(auto* _chunked_array_iterators) const noexcept { + alignas(ValueType) unsigned char buf[sizeof(ValueType)]{}; // raw storage; the fields are placement-constructed one by one below + auto ptr = internal::ptr_cast(&buf); + auto view = to_view(*ptr); + using ViewType = std::remove_cvref_t; + try { + const auto set_one = [&](std::integral_constant) { + using FieldType = tuple_element_t<_i, typename ViewType::Fields>; + using T = std::remove_cvref_t< + std::remove_pointer_t>; + auto res = *_chunked_array_iterators->template get<_i>(); + if (!res) { + destroy_value<_i>(&view); // clean up the fields before _i, which were already constructed + throw std::runtime_error( + std::string("Field '") + typename FieldType::Name().str() + + std::string("' could not be set: ") + res.error().what()); + } + ::new (view.template get<_i>()) T(std::move(*res)); + ++_chunked_array_iterators->template get<_i>(); + }; + + [&](std::integer_sequence) { + (set_one(std::integral_constant{}), ...); + }(std::make_integer_sequence()); + + return std::move(*ptr); // NOTE(review): no destructor call for the moved-from object in buf is visible here - confirm this is intended + } catch (const std::exception& e) { + return error(e.what()); + } + } + + template + void destroy_value(ViewType* _view) const { + static_assert(_i < ViewType::size(), "_i out of bounds."); + auto set = std::array(); + for (size_t i = 0; i < _i; ++i) { + set[i] = true; // fields before _i are flagged as set + } + for (size_t i = _i; i < ViewType::size(); ++i) { + set[i] = false; + } + call_destructors_where_necessary(set, _view); + } + + private: + Ref table_; +}; + +} // namespace 
rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/ArrowTypes.hpp b/include/rfl/parsing/tabular/ArrowTypes.hpp new file mode 100644 index 00000000..2c7bee83 --- /dev/null +++ b/include/rfl/parsing/tabular/ArrowTypes.hpp @@ -0,0 +1,537 @@ +#ifndef RFL_PARSING_TABULAR_ARROWTYPES_HPP_ +#define RFL_PARSING_TABULAR_ARROWTYPES_HPP_ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../../Box.hpp" +#include "../../NamedTuple.hpp" +#include "../../Ref.hpp" +#include "../../Rename.hpp" +#include "../../Timestamp.hpp" +#include "../../Tuple.hpp" +#include "../../concepts.hpp" +#include "../../enums.hpp" +#include "../../internal/StringLiteral.hpp" +#include "../../internal/has_reflection_type_v.hpp" +#include "../../internal/ptr_cast.hpp" +#include "../../named_tuple_t.hpp" + +namespace rfl::parsing::tabular { + +template +struct ArrowTypes; + +template <> +struct ArrowTypes { + using ArrayType = arrow::BooleanArray; + using BuilderType = arrow::BooleanBuilder; + + static auto data_type() { return arrow::boolean(); } + + static void add_to_builder(const bool _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + + static auto make_builder() { return BuilderType(); } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::UInt8Array; + using BuilderType = arrow::UInt8Builder; + + static auto data_type() { return arrow::uint8(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + + static auto make_builder() { return BuilderType(); } +}; + +template <> 
+struct ArrowTypes { + using ArrayType = arrow::UInt16Array; + using BuilderType = arrow::UInt16Builder; + + static auto data_type() { return arrow::uint16(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + + static auto make_builder() { return BuilderType(); } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::UInt32Array; + using BuilderType = arrow::UInt32Builder; + + static auto data_type() { return arrow::uint32(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + + static auto make_builder() { return BuilderType(); } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::UInt64Array; + using BuilderType = arrow::UInt64Builder; + + static auto data_type() { return arrow::uint64(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + + static auto make_builder() { return BuilderType(); } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::Int8Array; + using BuilderType = arrow::Int8Builder; + + static auto data_type() { return arrow::int8(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result get_value(const Ref& 
_chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + + static auto make_builder() { return BuilderType(); } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::Int16Array; + using BuilderType = arrow::Int16Builder; + + static auto data_type() { return arrow::int16(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + + static auto make_builder() { return BuilderType(); } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::Int32Array; + using BuilderType = arrow::Int32Builder; + + static auto data_type() { return arrow::int32(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + + static auto make_builder() { return BuilderType(); } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::Int64Array; + using BuilderType = arrow::Int64Builder; + + static auto data_type() { return arrow::int64(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + + static auto make_builder() { return BuilderType(); } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::FloatArray; + using BuilderType = arrow::FloatBuilder; + + static auto data_type() { return arrow::float32(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = 
_builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + + static auto make_builder() { return BuilderType(); } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::DoubleArray; + using BuilderType = arrow::DoubleBuilder; + + static auto data_type() { return arrow::float64(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + + static auto make_builder() { return BuilderType(); } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::StringArray; + using BuilderType = arrow::StringBuilder; + + static auto data_type() { return arrow::utf8(); } + + static void add_to_builder(const auto& _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return std::string(_chunk->Value(_ix)); + } + + static auto make_builder() { return BuilderType(); } +}; + +template + requires enchantum::Enum +struct ArrowTypes { + using ArrayType = arrow::StringArray; + using BuilderType = arrow::StringBuilder; + + static auto data_type() { return arrow::utf8(); } + + static void add_to_builder(const T _val, BuilderType* _builder) { + const auto status = _builder->Append(enum_to_string(_val)); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result get_value(const Ref& _chunk, const int64_t _ix) { + return string_to_enum(std::string(_chunk->Value(_ix))); + } + + static auto make_builder() { return BuilderType(); } +}; + +template + requires 
concepts::ContiguousByteContainer +struct ArrowTypes { + using ArrayType = arrow::BinaryArray; + using BuilderType = arrow::BinaryBuilder; + + static auto data_type() { return arrow::binary(); } + + static void add_to_builder(const auto& _val, BuilderType* _builder) { + const auto status = _builder->Append( + internal::ptr_cast(_val.data()), _val.size()); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result get_value(const Ref& _chunk, const int64_t _ix) { + const auto begin = internal::ptr_cast( + _chunk->Value(_ix).data()); + return T(begin, begin + _chunk->Value(_ix).size()); + } + + static auto make_builder() { return BuilderType(); } +}; + +template +struct ArrowTypes> { + using ArrayType = arrow::TimestampArray; + using BuilderType = arrow::TimestampBuilder; + + static auto data_type() { return arrow::timestamp(arrow::TimeUnit::SECOND); } + + static void add_to_builder(const auto& _val, BuilderType* _builder) { + const auto status = _builder->Append(_val.to_time_t()); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result> get_value(const Ref& _chunk, + const int64_t _ix) { + return Timestamp<_format>(_chunk->Value(_ix) / 1000); + } + + static auto make_builder() { + return BuilderType(data_type(), arrow::default_memory_pool()); + } +}; + +template + requires internal::has_reflection_type_v +struct ArrowTypes { + using ArrayType = typename ArrowTypes::ArrayType; + using BuilderType = + typename ArrowTypes::BuilderType; + + static auto data_type() { + return ArrowTypes::data_type(); + } + + static void add_to_builder(const auto& _val, BuilderType* _builder) { + ArrowTypes::add_to_builder(_val.reflection(), + _builder); + } + + static Result get_value(const Ref& _chunk, const int64_t _ix) { + return ArrowTypes>:: + get_value(_chunk, _ix) + .and_then([](const auto& _v) -> Result { + try { + return T(_v); + } catch (const std::exception& e) { + return error(e.what()); + } + }); + } + 
+ static auto make_builder() { + return ArrowTypes::make_builder(); + } +}; + +template +struct ArrowTypes> { + using ArrayType = typename ArrowTypes>::ArrayType; + using BuilderType = typename ArrowTypes>::BuilderType; + + static auto data_type() { return ArrowTypes::data_type(); } + + static void add_to_builder(const auto& _val, BuilderType* _builder) { + if (_val) { + ArrowTypes::add_to_builder(*_val, _builder); + } else { + const auto status = _builder->AppendNull(); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + } + + static auto get_value(const Ref& _chunk, const int64_t _ix) { + return ArrowTypes>::get_value(_chunk, _ix) + .transform([](const auto& _v) { return std::make_optional(_v); }); + } + + static auto make_builder() { return ArrowTypes::make_builder(); } +}; + +template +struct ArrowTypes> { + using ArrayType = typename ArrowTypes>::ArrayType; + using BuilderType = typename ArrowTypes>::BuilderType; + + static auto data_type() { return ArrowTypes::data_type(); } + + static void add_to_builder(const auto& _val, BuilderType* _builder) { + if (_val) { + ArrowTypes::add_to_builder(*_val, _builder); + } else { + const auto status = _builder->AppendNull(); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + } + + static auto get_value(const Ref& _chunk, const int64_t _ix) { + return ArrowTypes>::get_value(_chunk, _ix) + .transform([](const auto& _v) { return std::make_shared(_v); }); + } + + static auto make_builder() { return ArrowTypes::make_builder(); } +}; + +template +struct ArrowTypes> { + using ArrayType = typename ArrowTypes>::ArrayType; + using BuilderType = typename ArrowTypes>::BuilderType; + + static auto data_type() { return ArrowTypes::data_type(); } + + static void add_to_builder(const auto& _val, BuilderType* _builder) { + if (_val) { + ArrowTypes::add_to_builder(*_val, _builder); + } else { + const auto status = _builder->AppendNull(); + if (!status.ok()) { + throw 
std::runtime_error(status.message()); + } + } + } + + static auto get_value(const Ref& _chunk, const int64_t _ix) { + return ArrowTypes>::get_value(_chunk, _ix) + .transform([](const auto& _v) { return std::make_unique(_v); }); + } + + static auto make_builder() { return ArrowTypes::make_builder(); } +}; + +template +struct ArrowTypes> { + using ArrayType = typename ArrowTypes>::ArrayType; + using BuilderType = typename ArrowTypes>::BuilderType; + + static auto data_type() { return ArrowTypes::data_type(); } + + static void add_to_builder(const auto& _val, BuilderType* _builder) { + ArrowTypes::add_to_builder(*_val, _builder); + } + + static auto get_value(const Ref& _chunk, const int64_t _ix) { + return ArrowTypes>::get_value(_chunk, _ix) + .transform([](const auto& _v) { return Box::make(_v); }); + } + + static auto make_builder() { return ArrowTypes::make_builder(); } +}; + +template +struct ArrowTypes> { + using ArrayType = typename ArrowTypes>::ArrayType; + using BuilderType = typename ArrowTypes>::BuilderType; + + static auto data_type() { return ArrowTypes::data_type(); } + + static void add_to_builder(const auto& _val, BuilderType* _builder) { + ArrowTypes::add_to_builder(*_val, _builder); + } + + static auto get_value(const Ref& _chunk, const int64_t _ix) { + return ArrowTypes>::get_value(_chunk, _ix) + .transform([](const auto& _v) { return Ref::make(_v); }); + } + + static auto make_builder() { return ArrowTypes::make_builder(); } +}; + +template +struct ArrowTypes> { + using ArrayType = typename ArrowTypes>::ArrayType; + using BuilderType = typename ArrowTypes>::BuilderType; + + static auto data_type() { return ArrowTypes::data_type(); } + + static void add_to_builder(const auto& _val, BuilderType* _builder) { + ArrowTypes::add_to_builder(_val.value(), _builder); + } + + static auto get_value(const Ref& _chunk, const int64_t _ix) { + return ArrowTypes>::get_value(_chunk, _ix) + .transform([](const auto& _v) { return Rename<_name, T>(_v); }); + } + + 
static auto make_builder() { return ArrowTypes::make_builder(); } +}; + +} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/ArrowWriter.hpp b/include/rfl/parsing/tabular/ArrowWriter.hpp new file mode 100644 index 00000000..5deae33d --- /dev/null +++ b/include/rfl/parsing/tabular/ArrowWriter.hpp @@ -0,0 +1,117 @@ +#ifndef RFL_PARSING_TABULAR_ARROWWRITER_HPP_ +#define RFL_PARSING_TABULAR_ARROWWRITER_HPP_ + +#include + +#include +#include +#include +#include + +#include "../../Processors.hpp" +#include "../../Tuple.hpp" +#include "../../get.hpp" +#include "../../named_tuple_t.hpp" +#include "../../to_view.hpp" +#include "add_to_builder.hpp" +#include "make_arrow_builders.hpp" +#include "make_arrow_data_types.hpp" +#include "make_arrow_schema.hpp" + +namespace rfl::parsing::tabular { + +template +class ArrowWriter { + static_assert(!Processors::add_tags_to_variants_, + "rfl::AddTagsToVariants cannot be used for tabular data."); + static_assert(!Processors::all_required_, + "rfl::NoOptionals cannot be used for tabular data."); + static_assert(!Processors::default_if_missing_, + "rfl::DefaultIfMissing cannot be used for tabular data."); + static_assert(!Processors::no_extra_fields_, + "rfl::NoExtraFields cannot be used for tabular data (neither " + "can rfl::ExtraFields)."); + static_assert(!Processors::no_field_names_, + "rfl::NoFieldNames cannot be used for tabular data."); + + public: + using ValueType = typename std::remove_cvref_t; + + ArrowWriter(const size_t _chunksize) : chunksize_(_chunksize) {} + + ~ArrowWriter() = default; + + std::shared_ptr to_table(const VecType& _data) const { + return arrow::Table::Make( + make_arrow_schema>(), + to_chunked_arrays(_data)); + } + + private: + std::vector> to_chunked_arrays( + const VecType& _data) const; + + private: + size_t chunksize_; +}; + +template +std::vector> +ArrowWriter::to_chunked_arrays(const VecType& _data) const { + using ValueType = typename VecType::value_type; + + auto 
builders = + make_arrow_builders>(); + + constexpr size_t size = tuple_size_v; + + std::vector>> array_chunks(size); + + auto it = _data.begin(); + + while (it != _data.end()) { + size_t i = 0; + + for (; it != _data.end() && (i < chunksize_ || chunksize_ == 0); + ++i, ++it) { + const auto view = to_view(*it); + + [&](const auto& _v, auto* _b, + std::integer_sequence) { + (add_to_builder(*get<_is>(_v), &(_b->template get<_is>())), ...); + }(view, &builders, std::make_integer_sequence()); + } + + if (i != 0) { + std::vector> chunks(size); + + const auto finish_builder = [](auto* _b, auto* _c) { + const auto status = _b->Finish(_c); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + }; + + [&](auto* _b, auto* _c, + std::integer_sequence) { + (finish_builder(&_b->template get<_is>(), &_c->at(_is)), ...); + }(&builders, &chunks, std::make_integer_sequence()); + + for (size_t j = 0; j < size; ++j) { + array_chunks.at(j).emplace_back(std::move(chunks.at(j))); + } + } + } + + const auto data_types = make_arrow_data_types(); + + return [&](std::integer_sequence) { + return std::vector>( + {std::make_shared(array_chunks.at(_is), + std::get<_is>(data_types))...}); + }(std::make_integer_sequence()); +} + +} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp new file mode 100644 index 00000000..68b229bf --- /dev/null +++ b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp @@ -0,0 +1,96 @@ +#ifndef RFL_PARSING_TABULAR_CHUNKEDARRAYITERATOR_HPP_ +#define RFL_PARSING_TABULAR_CHUNKEDARRAYITERATOR_HPP_ + +#include + +#include +#include +#include +#include + +#include "../../Ref.hpp" +#include "../../Result.hpp" +#include "../../internal/ptr_cast.hpp" +#include "../is_required.hpp" +#include "array_t.hpp" + +namespace rfl::parsing::tabular { + +template +class ChunkedArrayIterator { + public: + using difference_type = std::ptrdiff_t; + using 
value_type = Result; + + using ArrayType = array_t; + + static ChunkedArrayIterator make(const Ref& _arr) { + return ChunkedArrayIterator(_arr); + } + + ChunkedArrayIterator(const Ref& _arr) + : arr_(_arr), chunk_ix_(0), current_chunk_(get_chunk(arr_, 0)), ix_(0) {} + + ~ChunkedArrayIterator() = default; + + Result operator*() const noexcept { + const bool is_null = + current_chunk_ + .transform([&](const auto& _c) { return _c->IsNull(ix_); }) + .value_or(false); + + if (is_null) { + if constexpr (is_required()) { + return error("Value cannot be null."); + } else { + return T(); + } + } + + return current_chunk_.and_then( + [&](const auto& _c) { return ArrowTypes::get_value(_c, ix_); }); + } + + bool end() const noexcept { + return !current_chunk_ || (chunk_ix_ >= arr_->num_chunks()); + } + + ChunkedArrayIterator& operator++() noexcept { + if (!current_chunk_) { + return *this; + } + ++ix_; + if (ix_ >= (*current_chunk_)->length()) { + ++chunk_ix_; + current_chunk_ = get_chunk(arr_, chunk_ix_); + ix_ = 0; + } + return *this; + } + + void operator++(int) noexcept { ++*this; } + + private: + static Result> get_chunk(const Ref& _arr, + const int _chunk_ix) noexcept { + if (_chunk_ix < _arr->num_chunks()) { + return Ref::make( + std::static_pointer_cast(_arr->chunk(_chunk_ix))); + } else { + return error("chunk_ix out of bounds."); + } + } + + private: + Ref arr_; + + int chunk_ix_; + + Result> current_chunk_; + + int64_t ix_; +}; + +} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/add_to_builder.hpp b/include/rfl/parsing/tabular/add_to_builder.hpp new file mode 100644 index 00000000..1d42a22f --- /dev/null +++ b/include/rfl/parsing/tabular/add_to_builder.hpp @@ -0,0 +1,18 @@ +#ifndef RFL_PARSING_TABULAR_ADD_TO_BUILDER_HPP_ +#define RFL_PARSING_TABULAR_ADD_TO_BUILDER_HPP_ + +#include + +#include "../../named_tuple_t.hpp" +#include "ArrowTypes.hpp" + +namespace rfl::parsing::tabular { + +template +inline void add_to_builder(const 
ValueType& _val, BuilderType* _builder) { + ArrowTypes>::add_to_builder(_val, _builder); +} + +} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/array_t.hpp b/include/rfl/parsing/tabular/array_t.hpp new file mode 100644 index 00000000..6d083af8 --- /dev/null +++ b/include/rfl/parsing/tabular/array_t.hpp @@ -0,0 +1,13 @@ +#ifndef RFL_PARSING_TABULAR_ARRAYT_HPP_ +#define RFL_PARSING_TABULAR_ARRAYT_HPP_ + +#include "ArrowTypes.hpp" + +namespace rfl::parsing::tabular { + +template +using array_t = typename ArrowTypes>::ArrayType; + +} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/make_arrow_builders.hpp b/include/rfl/parsing/tabular/make_arrow_builders.hpp new file mode 100644 index 00000000..501694ee --- /dev/null +++ b/include/rfl/parsing/tabular/make_arrow_builders.hpp @@ -0,0 +1,57 @@ +#ifndef RFL_PARSING_TABULAR_MAKEARROWBUILDERS_HPP_ +#define RFL_PARSING_TABULAR_MAKEARROWBUILDERS_HPP_ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../../named_tuple_t.hpp" +#include "ArrowTypes.hpp" + +namespace rfl::parsing::tabular { + +template +using arrow_builder_t = typename ArrowTypes< + std::remove_cvref_t>>::BuilderType; + +template +struct ArrowBuildersType; + +template +struct ArrowBuildersType> { + using Type = Tuple...>; + + static auto data_types() { + return [&](std::integer_sequence) { + return std::array, + sizeof...(FieldTypes)>( + {ArrowTypes::data_type()...}); + }(std::make_integer_sequence()); + } + + static Type make_builders() { + return Type(ArrowTypes::make_builder()...); + } + + static auto schema() { + const auto fields = std::vector>( + {arrow::field(typename FieldTypes::Name().str(), + ArrowTypes::data_type())...}); + return arrow::schema(fields); + } +}; + +template +auto make_arrow_builders() { + return ArrowBuildersType>::make_builders(); +} + +} // namespace rfl::parsing::tabular + +#endif diff --git 
a/include/rfl/parsing/tabular/make_arrow_data_types.hpp b/include/rfl/parsing/tabular/make_arrow_data_types.hpp new file mode 100644 index 00000000..0fb237bd --- /dev/null +++ b/include/rfl/parsing/tabular/make_arrow_data_types.hpp @@ -0,0 +1,18 @@ +#ifndef RFL_PARSING_TABULAR_MAKE_ARROW_DATA_TYPES_HPP_ +#define RFL_PARSING_TABULAR_MAKE_ARROW_DATA_TYPES_HPP_ + +#include + +#include "../../named_tuple_t.hpp" +#include "make_arrow_builders.hpp" + +namespace rfl::parsing::tabular { + +template +inline auto make_arrow_data_types() { + return ArrowBuildersType>>::data_types(); +} + +} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/make_arrow_schema.hpp b/include/rfl/parsing/tabular/make_arrow_schema.hpp new file mode 100644 index 00000000..b9c6268a --- /dev/null +++ b/include/rfl/parsing/tabular/make_arrow_schema.hpp @@ -0,0 +1,18 @@ +#ifndef RFL_PARSING_TABULAR_MAKE_ARROW_SCHEMA_HPP_ +#define RFL_PARSING_TABULAR_MAKE_ARROW_SCHEMA_HPP_ + +#include + +#include "../../named_tuple_t.hpp" +#include "make_arrow_builders.hpp" + +namespace rfl::parsing::tabular { + +template +inline auto make_arrow_schema() { + return ArrowBuildersType>>::schema(); +} + +} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/make_chunked_array_iterators.hpp b/include/rfl/parsing/tabular/make_chunked_array_iterators.hpp new file mode 100644 index 00000000..697b8b87 --- /dev/null +++ b/include/rfl/parsing/tabular/make_chunked_array_iterators.hpp @@ -0,0 +1,57 @@ +#ifndef RFL_PARSING_TABULAR_MAKECHUNKEDARRAYITERATORS_HPP_ +#define RFL_PARSING_TABULAR_MAKECHUNKEDARRAYITERATORS_HPP_ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../../NamedTuple.hpp" +#include "../../Ref.hpp" +#include "../../Result.hpp" +#include "../../Tuple.hpp" +#include "ArrowTypes.hpp" +#include "ChunkedArrayIterator.hpp" + +namespace rfl::parsing::tabular { + +template +struct MakeChunkedArrayIterators; + 
+template +struct MakeChunkedArrayIterators> { + using TupleType = Tuple...>; + + Result operator()(const Ref& _table) const { + const auto get_column = + [&](const std::string& _colname) -> Result> { + const auto col = _table->GetColumnByName(_colname); + if (!col) { + return error("Column named '" + _colname + "' not found."); + } + return Ref::make(col); + }; + + try { + return TupleType( + get_column(typename FieldTypes::Name().str()) + .transform(ChunkedArrayIterator::make) + .value()...); + } catch (const std::exception& e) { + return error(e.what()); + } + } +}; + +template +const auto make_chunked_array_iterators = + MakeChunkedArrayIterators{}; + +} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/view_t.hpp b/include/rfl/view_t.hpp new file mode 100644 index 00000000..c1c7569d --- /dev/null +++ b/include/rfl/view_t.hpp @@ -0,0 +1,20 @@ +#ifndef RFL_VIEW_T_HPP_ +#define RFL_VIEW_T_HPP_ + +#include + +#include "Processors.hpp" +#include "internal/ptr_named_tuple_t.hpp" + +namespace rfl { + +/// Generates the named tuple that would be the result of to_view +template +using view_t = + std::invoke_result_t::template process< + T, internal::ptr_named_tuple_t>), + internal::ptr_named_tuple_t>; + +} // namespace rfl + +#endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ae98d010..9e559b73 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -36,6 +36,10 @@ if (REFLECTCPP_MSGPACK) add_subdirectory(msgpack) endif() +if (REFLECTCPP_PARQUET) + add_subdirectory(parquet) +endif() + if (REFLECTCPP_TOML) add_subdirectory(toml) endif() diff --git a/tests/parquet/CMakeLists.txt b/tests/parquet/CMakeLists.txt new file mode 100644 index 00000000..c42b7a5b --- /dev/null +++ b/tests/parquet/CMakeLists.txt @@ -0,0 +1,21 @@ +project(reflect-cpp-parquet-tests) + +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS "*.cpp") + +add_executable( + reflect-cpp-parquet-tests + ${SOURCES} +) +target_precompile_headers(reflect-cpp-parquet-tests 
PRIVATE [["rfl.hpp"]] ) + +target_include_directories(reflect-cpp-parquet-tests SYSTEM PRIVATE "${VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/include") + +target_link_libraries( + reflect-cpp-parquet-tests + PRIVATE + "${REFLECT_CPP_GTEST_LIB}" +) + +find_package(GTest) +gtest_discover_tests(reflect-cpp-parquet-tests) + diff --git a/tests/parquet/test_boolean.cpp b/tests/parquet/test_boolean.cpp new file mode 100644 index 00000000..e01f1ad3 --- /dev/null +++ b/tests/parquet/test_boolean.cpp @@ -0,0 +1,47 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_boolean { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + bool is_child; + Age age; + rfl::Email email; +}; + +TEST(parquet, test_boolean) { + const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .is_child = true, + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .is_child = true, + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .is_child = true, + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .is_child = false, + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_boolean diff --git a/tests/parquet/test_box.cpp b/tests/parquet/test_box.cpp new file mode 100644 index 00000000..8fd5098a --- /dev/null +++ b/tests/parquet/test_box.cpp @@ -0,0 +1,47 @@ +#include +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_box { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", 
std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Box email; +}; + +TEST(parquet, test_box) { + auto people = std::vector(); + people.emplace_back( + Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = rfl::make_box("bart@simpson.com")}); + people.emplace_back( + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = rfl::make_box("lisa@simpson.com")}); + people.emplace_back( + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = rfl::make_box("maggie@simpson.com")}); + people.emplace_back( + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = rfl::make_box("homer@simpson.com")}); + + write_and_read(people); +} +} // namespace test_box diff --git a/tests/parquet/test_bytestring.cpp b/tests/parquet/test_bytestring.cpp new file mode 100644 index 00000000..55e9f4e8 --- /dev/null +++ b/tests/parquet/test_bytestring.cpp @@ -0,0 +1,24 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_bytestring { + +struct TestStruct { + rfl::Bytestring bytestring; +}; + +TEST(parquet, test_bytestring) { + const auto test_struct = + TestStruct{.bytestring = rfl::Bytestring({std::byte{13}, std::byte{14}, + std::byte{15}, std::byte{16}})}; + + const auto test_structs = std::vector( + {test_struct, test_struct, test_struct, test_struct}); + + write_and_read(test_structs); +} +} // namespace test_bytestring diff --git a/tests/parquet/test_camel_case.cpp b/tests/parquet/test_camel_case.cpp new file mode 100644 index 00000000..7eeffa30 --- /dev/null +++ b/tests/parquet/test_camel_case.cpp @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_camel_case { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + std::string first_name; + std::string last_name = "Simpson"; + std::string town = 
"Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(parquet, test_camel_case) { + const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_camel_case diff --git a/tests/parquet/test_deque.cpp b/tests/parquet/test_deque.cpp new file mode 100644 index 00000000..41b40ab8 --- /dev/null +++ b/tests/parquet/test_deque.cpp @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_deque { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(parquet, test_deque) { + const auto people = + std::deque({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_deque diff --git a/tests/parquet/test_enums.cpp b/tests/parquet/test_enums.cpp new file mode 100644 index 00000000..ff3b2091 --- /dev/null +++ b/tests/parquet/test_enums.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include + 
+#include "write_and_read.hpp" + +namespace test_enums { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +enum class FirstName { Bart, Lisa, Maggie, Homer }; + +struct Person { + rfl::Rename<"firstName", FirstName> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(parquet, test_enums) { + const auto people = + std::vector({Person{.first_name = FirstName::Bart, + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = FirstName::Lisa, + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = FirstName::Lisa, + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = FirstName::Homer, + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_enums diff --git a/tests/parquet/test_flatten.cpp b/tests/parquet/test_flatten.cpp new file mode 100644 index 00000000..4c7c12e1 --- /dev/null +++ b/tests/parquet/test_flatten.cpp @@ -0,0 +1,54 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_flatten { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Address { + std::string street; + std::string city; +}; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; + rfl::Flatten
address; +}; + +TEST(parquet, test_flatten) { + const auto address = + Address{.street = "Evergreen Terrace", .city = "Springfield"}; + + const auto people = std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com", + .address = address}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com", + .address = address}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com", + .address = address}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com", + .address = address}}); + + write_and_read(people); +} +} // namespace test_flatten diff --git a/tests/parquet/test_gzip.cpp b/tests/parquet/test_gzip.cpp new file mode 100644 index 00000000..ee254395 --- /dev/null +++ b/tests/parquet/test_gzip.cpp @@ -0,0 +1,45 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_gzip { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(parquet, test_gzip) { + const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + const auto settings = rfl::parquet::Settings{}.with_compression( + rfl::parquet::Compression::GZIP); + + write_and_read(people, settings); +} +} // namespace test_gzip diff --git 
a/tests/parquet/test_literal.cpp b/tests/parquet/test_literal.cpp new file mode 100644 index 00000000..1e36b6ca --- /dev/null +++ b/tests/parquet/test_literal.cpp @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_literal { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + std::string first_name; + rfl::Literal<"Simpson"> last_name; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(parquet, test_literal) { + const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_literal diff --git a/tests/parquet/test_optionals.cpp b/tests/parquet/test_optionals.cpp new file mode 100644 index 00000000..b5d3df80 --- /dev/null +++ b/tests/parquet/test_optionals.cpp @@ -0,0 +1,40 @@ +#include +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_optionals { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + std::optional email; +}; + +TEST(parquet, test_optionals) { + const auto people = std::vector( + {Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", 
.birthday = "1987-04-19", .age = 0}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_optionals diff --git a/tests/parquet/test_readme_example.cpp b/tests/parquet/test_readme_example.cpp new file mode 100644 index 00000000..48fc082c --- /dev/null +++ b/tests/parquet/test_readme_example.cpp @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_readme_example { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(parquet, test_readme_example) { + const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_readme_example diff --git a/tests/parquet/test_ref.cpp b/tests/parquet/test_ref.cpp new file mode 100644 index 00000000..03417a85 --- /dev/null +++ b/tests/parquet/test_ref.cpp @@ -0,0 +1,47 @@ +#include +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_ref { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Ref email; +}; + +TEST(parquet, test_ref) { + auto 
people = std::vector(); + people.emplace_back( + Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = rfl::make_ref("bart@simpson.com")}); + people.emplace_back( + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = rfl::make_ref("lisa@simpson.com")}); + people.emplace_back( + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = rfl::make_ref("maggie@simpson.com")}); + people.emplace_back( + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = rfl::make_ref("homer@simpson.com")}); + + write_and_read(people); +} +} // namespace test_ref diff --git a/tests/parquet/test_save_load.cpp b/tests/parquet/test_save_load.cpp new file mode 100644 index 00000000..fe760dc6 --- /dev/null +++ b/tests/parquet/test_save_load.cpp @@ -0,0 +1,53 @@ +#include + +#include +#include +#include +#include +#include +#include + +namespace test_save_load { + +using Age = rfl::Validator, rfl::Maximum<130>>>; + +struct Person { + std::string first_name; + std::string last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(parquet, test_save_load) { + const auto people1 = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + rfl::parquet::save("people.parquet", people1); + + const auto people2 = + rfl::parquet::load>("people.parquet").value(); + + const auto bytes1 = rfl::parquet::write(people1); + const auto bytes2 = rfl::parquet::write(people2); + + EXPECT_EQ(bytes1, bytes2); +} +} // namespace test_save_load diff --git 
a/tests/parquet/test_shared_ptr.cpp b/tests/parquet/test_shared_ptr.cpp new file mode 100644 index 00000000..09f9c6e7 --- /dev/null +++ b/tests/parquet/test_shared_ptr.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_shared_ptr { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + std::shared_ptr email; +}; + +TEST(parquet, test_shared_ptr) { + auto people = std::vector(); + people.emplace_back( + Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = std::make_shared("bart@simpson.com")}); + people.emplace_back( + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = std::make_shared("lisa@simpson.com")}); + people.emplace_back( + Person{.first_name = "Maggie", .birthday = "1987-04-19", .age = 0}); + people.emplace_back( + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = std::make_shared("homer@simpson.com")}); + + write_and_read(people); +} +} // namespace test_shared_ptr diff --git a/tests/parquet/test_unique_ptr.cpp b/tests/parquet/test_unique_ptr.cpp new file mode 100644 index 00000000..1df85ffa --- /dev/null +++ b/tests/parquet/test_unique_ptr.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_unique_ptr { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + std::unique_ptr email; +}; + +TEST(parquet, test_unique_ptr) { + auto people = std::vector(); + people.emplace_back( + Person{.first_name = "Bart", 
+ .birthday = "1987-04-19", + .age = 10, + .email = std::make_unique("bart@simpson.com")}); + people.emplace_back( + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = std::make_unique("lisa@simpson.com")}); + people.emplace_back( + Person{.first_name = "Maggie", .birthday = "1987-04-19", .age = 0}); + people.emplace_back( + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = std::make_unique("homer@simpson.com")}); + + write_and_read(people); +} +} // namespace test_unique_ptr diff --git a/tests/parquet/write_and_read.hpp b/tests/parquet/write_and_read.hpp new file mode 100644 index 00000000..53ee3b9c --- /dev/null +++ b/tests/parquet/write_and_read.hpp @@ -0,0 +1,22 @@ +#ifndef WRITE_AND_READ_ +#define WRITE_AND_READ_ + +#include + +#include +#include +#include + +template +void write_and_read(const auto& _vec, const rfl::parquet::Settings& _settings = + rfl::parquet::Settings{}) { + using T = std::remove_cvref_t; + const auto serialized1 = rfl::parquet::write(_vec, _settings); + const auto res = rfl::parquet::read(serialized1); + ASSERT_TRUE(res && true) << "Test failed on read. Error: " + << res.error().what(); + const auto serialized2 = rfl::parquet::write(res.value(), _settings); + EXPECT_EQ(serialized1, serialized2); +} + +#endif diff --git a/vcpkg.json b/vcpkg.json index 56a0e150..635d4e09 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -110,6 +110,16 @@ } ] }, + "parquet": { + "description": "Enable parquet support", + "dependencies": [ + { + "name": "arrow", + "version>=": "21.0.0", + "features": ["parquet"] + } + ] + }, "tests": { "description": "Compile the tests", "dependencies": [