From 24cf623ff6e9e64c38b63691ef2f10909d199eb4 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Wed, 3 Sep 2025 22:11:23 +0200 Subject: [PATCH 01/14] Started developing the CSV reader and writer --- CMakeLists.txt | 16 ++++++- include/rfl/csv.hpp | 10 +++++ include/rfl/csv/Settings.hpp | 22 +++++++++ include/rfl/csv/load.hpp | 20 +++++++++ include/rfl/csv/read.hpp | 74 +++++++++++++++++++++++++++++++ include/rfl/csv/save.hpp | 24 ++++++++++ include/rfl/csv/write.hpp | 74 +++++++++++++++++++++++++++++++ reflectcpp-config.cmake.in | 11 +++++ tests/CMakeLists.txt | 4 ++ tests/csv/CMakeLists.txt | 21 +++++++++ tests/csv/test_readme_example.cpp | 44 ++++++++++++++++++ tests/csv/write_and_read.hpp | 22 +++++++++ vcpkg.json | 10 +++++ 13 files changed, 350 insertions(+), 2 deletions(-) create mode 100644 include/rfl/csv.hpp create mode 100644 include/rfl/csv/Settings.hpp create mode 100644 include/rfl/csv/load.hpp create mode 100644 include/rfl/csv/read.hpp create mode 100644 include/rfl/csv/save.hpp create mode 100644 include/rfl/csv/write.hpp create mode 100644 tests/csv/CMakeLists.txt create mode 100644 tests/csv/test_readme_example.cpp create mode 100644 tests/csv/write_and_read.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e811cb42..e73bd25a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,7 @@ option(REFLECTCPP_AVRO "Enable AVRO support" ${REFLECTCPP_ALL_FORMATS}) option(REFLECTCPP_BSON "Enable BSON support" ${REFLECTCPP_ALL_FORMATS}) option(REFLECTCPP_CAPNPROTO "Enable Cap’n Proto support" ${REFLECTCPP_ALL_FORMATS}) option(REFLECTCPP_CBOR "Enable CBOR support" ${REFLECTCPP_ALL_FORMATS}) +option(REFLECTCPP_CSV "Enable CSV support" ${REFLECTCPP_ALL_FORMATS}) option(REFLECTCPP_FLEXBUFFERS "Enable flexbuffers support" ${REFLECTCPP_ALL_FORMATS}) option(REFLECTCPP_MSGPACK "Enable msgpack support" ${REFLECTCPP_ALL_FORMATS}) option(REFLECTCPP_PARQUET "Enable parquet support" ${REFLECTCPP_ALL_FORMATS}) @@ -55,8 +56,8 @@ endif() if 
(REFLECTCPP_BUILD_TESTS OR REFLECTCPP_BUILD_BENCHMARKS OR (REFLECTCPP_JSON AND NOT REFLECTCPP_USE_BUNDLED_DEPENDENCIES) OR REFLECTCPP_AVRO OR - REFLECTCPP_BSON OR REFLECTCPP_CAPNPROTO OR REFLECTCPP_CBOR OR REFLECTCPP_FLEXBUFFERS OR - REFLECTCPP_MSGPACK OR REFLECTCPP_PARQUET OR REFLECTCPP_XML OR + REFLECTCPP_BSON OR REFLECTCPP_CAPNPROTO OR REFLECTCPP_CBOR OR REFLECTCPP_CSV OR + REFLECTCPP_FLEXBUFFERS OR REFLECTCPP_MSGPACK OR REFLECTCPP_PARQUET OR REFLECTCPP_XML OR REFLECTCPP_TOML OR REFLECTCPP_UBJSON OR REFLECTCPP_YAML) # enable vcpkg per default if features other than JSON are required set(REFLECTCPP_USE_VCPKG_DEFAULT ON) @@ -95,6 +96,10 @@ if (REFLECTCPP_USE_VCPKG) list(APPEND VCPKG_MANIFEST_FEATURES "cbor") endif() + if (REFLECTCPP_CSV) + list(APPEND VCPKG_MANIFEST_FEATURES "csv") + endif() + if (NOT REFLECTCPP_USE_BUNDLED_DEPENDENCIES) list(APPEND VCPKG_MANIFEST_FEATURES "ctre") endif() @@ -246,6 +251,13 @@ if (REFLECTCPP_CBOR) include_directories(PUBLIC ${jsoncons_INCLUDE_DIRS}) endif () +if (REFLECTCPP_CSV) + if (NOT TARGET Arrow) + find_package(Arrow CONFIG REQUIRED) + endif() + target_link_libraries(reflectcpp PUBLIC "$,Arrow::arrow_static,Arrow::arrow_shared>") +endif () + if (REFLECTCPP_FLEXBUFFERS) list(APPEND REFLECT_CPP_SOURCES src/reflectcpp_flexbuf.cpp diff --git a/include/rfl/csv.hpp b/include/rfl/csv.hpp new file mode 100644 index 00000000..fc657579 --- /dev/null +++ b/include/rfl/csv.hpp @@ -0,0 +1,10 @@ +#ifndef RFL_CSV_HPP_ +#define RFL_CSV_HPP_ + +#include "../rfl.hpp" +#include "csv/load.hpp" +#include "csv/read.hpp" +#include "csv/save.hpp" +#include "csv/write.hpp" + +#endif diff --git a/include/rfl/csv/Settings.hpp b/include/rfl/csv/Settings.hpp new file mode 100644 index 00000000..6875f76a --- /dev/null +++ b/include/rfl/csv/Settings.hpp @@ -0,0 +1,22 @@ +#ifndef RFL_CSV_SETTINGS_HPP_ +#define RFL_CSV_SETTINGS_HPP_ + +#include + +#include "../Field.hpp" +#include "../replace.hpp" + +namespace rfl::csv { + +struct Settings { + /// The size 
of the chunks of the csv file. + size_t chunksize = 2000; + + Settings with_chunksize(const size_t _chunksize) const noexcept { + return replace(*this, make_field<"chunksize">(_chunksize)); + } +}; + +} // namespace rfl::csv + +#endif diff --git a/include/rfl/csv/load.hpp b/include/rfl/csv/load.hpp new file mode 100644 index 00000000..ecfa06b3 --- /dev/null +++ b/include/rfl/csv/load.hpp @@ -0,0 +1,20 @@ +#ifndef RFL_CSV_CSV_HPP_ +#define RFL_CSV_CSV_HPP_ + +#include "../Result.hpp" +#include "../io/load_string.hpp" +#include "read.hpp" + +namespace rfl::csv { + +template +Result load(const std::string& _fname) { + const auto read_string = [](const auto& _str) { + return read(_str); + }; + return rfl::io::load_string(_fname).and_then(read_string); +} + +} // namespace rfl::csv + +#endif diff --git a/include/rfl/csv/read.hpp b/include/rfl/csv/read.hpp new file mode 100644 index 00000000..e4e7b1b0 --- /dev/null +++ b/include/rfl/csv/read.hpp @@ -0,0 +1,74 @@ +#ifndef RFL_CSV_READ_HPP_ +#define RFL_CSV_READ_HPP_ + +#include +#include + +#include +#include +#include + +#include "../Processors.hpp" +#include "../Result.hpp" +#include "../concepts.hpp" +#include "../internal/wrap_in_rfl_array_t.hpp" +#include "../parsing/tabular/ArrowReader.hpp" + +namespace rfl::csv { + +/// Parses an object from CSV using reflection. 
+template +Result> read(const char* _str, + const size_t _size) { + arrow::io::IOContext io_context = arrow::io::default_io_context(); + + const auto buffer = std::make_shared( + internal::ptr_cast(_str), _size); + + std::shared_ptr input = + std::make_shared(buffer); + + auto read_options = arrow::csv::ReadOptions::Defaults(); + auto parse_options = arrow::csv::ParseOptions::Defaults(); + auto convert_options = arrow::csv::ConvertOptions::Defaults(); + + auto maybe_reader = arrow::csv::TableReader::Make( + io_context, input, read_options, parse_options, convert_options); + + if (!maybe_reader.ok()) { + return error("Could not construct CSV reader: " + + maybe_reader.status().message()); + } + + std::shared_ptr reader = *maybe_reader; + + auto maybe_table = reader->Read(); + if (!maybe_table.ok()) { + return error("Could not read table: " + maybe_table.status().message()); + } + + const std::shared_ptr table = *maybe_table; + + using ArrowReader = parsing::tabular::ArrowReader; + + return ArrowReader::make(table).and_then( + [](const auto& _r) { return _r.read(); }); +} + +/// Parses an object from CSV using reflection. +template +auto read(const std::string& _str) { + return read(_str.c_str(), _str.size()); +} + +/// Parses an object from a stream. 
+template +auto read(std::istream& _stream) { + std::istreambuf_iterator begin(_stream), end; + auto bytes = std::vector(begin, end); + return read(bytes.data(), bytes.size()); +} + +} // namespace rfl::csv + +#endif diff --git a/include/rfl/csv/save.hpp b/include/rfl/csv/save.hpp new file mode 100644 index 00000000..806fa92f --- /dev/null +++ b/include/rfl/csv/save.hpp @@ -0,0 +1,24 @@ +#ifndef RFL_CSV_SAVE_HPP_ +#define RFL_CSV_SAVE_HPP_ + +#include +#include +#include + +#include "../Result.hpp" +#include "../io/save_string.hpp" +#include "write.hpp" + +namespace rfl::csv { + +template +Result save(const std::string& _fname, const auto& _obj) { + const auto write_func = [](const auto& _obj, auto& _stream) -> auto& { + return write(_obj, _stream); + }; + return rfl::io::save_string(_fname, _obj, write_func); +} + +} // namespace rfl::csv + +#endif diff --git a/include/rfl/csv/write.hpp b/include/rfl/csv/write.hpp new file mode 100644 index 00000000..d00a69a1 --- /dev/null +++ b/include/rfl/csv/write.hpp @@ -0,0 +1,74 @@ +#ifndef RFL_CSV_WRITE_HPP_ +#define RFL_CSV_WRITE_HPP_ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../Processors.hpp" +#include "../Ref.hpp" +#include "../parsing/tabular/ArrowWriter.hpp" +#include "Settings.hpp" + +namespace rfl::csv { + +/// Returns CSV bytes. 
+template +Ref to_buffer(const auto& _arr, const Settings& _settings) { + using T = std::remove_cvref_t; + + const auto table = + parsing::tabular::ArrowWriter(_settings.chunksize) + .to_table(_arr); + + const auto output_buffer = arrow::io::BufferOutputStream::Create(); + + if (!output_buffer.ok()) { + throw std::runtime_error(output_buffer.status().message()); + } + + const auto status = + arrow::csv::WriteCSV(*table, arrow::csv::WriteOptions::Defaults(), + output_buffer.ValueOrDie().get()); + + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + + const auto buffer = output_buffer.ValueOrDie()->Finish(); + + if (!buffer.ok()) { + throw std::runtime_error(output_buffer.status().message()); + } + + return Ref::make(buffer.ValueOrDie()).value(); +} + +/// Returns CSV bytes. +template +std::string write(const auto& _arr, const Settings& _settings = Settings{}) { + const auto buffer = to_buffer(_arr, _settings); + const auto view = std::string_view(*buffer); + return std::string(view); +} + +/// Writes a CSV into an ostream. 
+template +std::ostream& write(const auto& _arr, std::ostream& _stream, + const Settings& _settings = Settings{}) { + // Deliberately not noexcept: to_buffer throws std::runtime_error on Arrow errors. + auto buffer = to_buffer(_arr, _settings); + _stream << std::string_view(*buffer); + return _stream; +} + +} // namespace rfl::csv + +#endif diff --git a/reflectcpp-config.cmake.in b/reflectcpp-config.cmake.in index c38da966..128a89c7 100644 --- a/reflectcpp-config.cmake.in +++ b/reflectcpp-config.cmake.in @@ -4,8 +4,10 @@ set(REFLECTCPP_JSON @REFLECTCPP_JSON@) set(REFLECTCPP_BSON @REFLECTCPP_BSON@) set(REFLECTCPP_CAPNPROTO @REFLECTCPP_CAPNPROTO@) set(REFLECTCPP_CBOR @REFLECTCPP_CBOR@) +set(REFLECTCPP_CSV @REFLECTCPP_CSV@) set(REFLECTCPP_FLEXBUFFERS @REFLECTCPP_FLEXBUFFERS@) set(REFLECTCPP_MSGPACK @REFLECTCPP_MSGPACK@) +set(REFLECTCPP_PARQUET @REFLECTCPP_PARQUET@) set(REFLECTCPP_TOML @REFLECTCPP_TOML@) set(REFLECTCPP_UBJSON @REFLECTCPP_UBJSON@) set(REFLECTCPP_XML @REFLECTCPP_XML@) @@ -38,6 +40,10 @@ if (REFLECTCPP_CBOR OR REFLECTCPP_UBJSON) find_dependency(jsoncons) endif () +if (REFLECTCPP_CSV) + find_dependency(Arrow) +endif() + if (REFLECTCPP_FLEXBUFFERS) find_dependency(flatbuffers) endif () @@ -46,6 +52,11 @@ if (REFLECTCPP_MSGPACK) find_dependency(msgpack-c) endif() +if (REFLECTCPP_PARQUET) + find_dependency(Arrow) + find_dependency(Parquet) +endif() + if (REFLECTCPP_TOML) find_dependency(tomlplusplus) endif() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9e559b73..bfcab240 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -28,6 +28,10 @@ if (REFLECTCPP_CBOR) add_subdirectory(cbor) endif() +if (REFLECTCPP_CSV) + add_subdirectory(csv) +endif() + if (REFLECTCPP_FLEXBUFFERS) add_subdirectory(flexbuffers) endif() diff --git a/tests/csv/CMakeLists.txt b/tests/csv/CMakeLists.txt new file mode 100644 index 00000000..a435a433 --- /dev/null +++ b/tests/csv/CMakeLists.txt @@ -0,0 +1,21 @@ +project(reflect-cpp-csv-tests) + +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS "*.cpp") + +add_executable( + 
reflect-cpp-csv-tests + ${SOURCES} +) +target_precompile_headers(reflect-cpp-csv-tests PRIVATE [["rfl.hpp"]] ) + +target_include_directories(reflect-cpp-csv-tests SYSTEM PRIVATE "${VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/include") + +target_link_libraries( + reflect-cpp-csv-tests + PRIVATE + "${REFLECT_CPP_GTEST_LIB}" +) + +find_package(GTest) +gtest_discover_tests(reflect-cpp-csv-tests) + diff --git a/tests/csv/test_readme_example.cpp b/tests/csv/test_readme_example.cpp new file mode 100644 index 00000000..31727ef9 --- /dev/null +++ b/tests/csv/test_readme_example.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_readme_example { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + // TODO + // rfl::Timestamp<"%Y-%m-%d"> birthday; + // std::string birthday; + // Age age; + rfl::Email email; +}; + +TEST(parquet, test_readme_example) { + const auto people = + std::vector({Person{.first_name = "Bart", + //.birthday = "1987-04-19", + //.age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + //.birthday = "1987-04-19", + //.age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + //.birthday = "1987-04-19", + //.age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + //.birthday = "1987-04-19", + //.age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_readme_example diff --git a/tests/csv/write_and_read.hpp b/tests/csv/write_and_read.hpp new file mode 100644 index 00000000..b9da1944 --- /dev/null +++ b/tests/csv/write_and_read.hpp @@ -0,0 +1,22 @@ +#ifndef WRITE_AND_READ_ +#define WRITE_AND_READ_ + +#include + +#include +#include +#include + +template +void write_and_read(const auto& _vec, const rfl::csv::Settings& _settings = + 
rfl::csv::Settings{}) { + using T = std::remove_cvref_t; + const auto serialized1 = rfl::csv::write(_vec, _settings); + const auto res = rfl::csv::read(serialized1); + EXPECT_TRUE(res && true) << "Test failed on read. Error: " + << res.error().what(); + const auto serialized2 = rfl::csv::write(res.value(), _settings); + EXPECT_EQ(serialized1, serialized2); +} + +#endif diff --git a/vcpkg.json b/vcpkg.json index 635d4e09..79c2900b 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -74,6 +74,16 @@ } ] }, + "csv": { + "description": "Enable CSV support", + "dependencies": [ + { + "name": "arrow", + "version>=": "21.0.0", + "features": ["csv"] + } + ] + }, "ctre": { "description": "Install CTRE using vcpkg instead of using the bundled version", "dependencies": [ From 81204de2f63428ee63b81745aacd18af561e6bb9 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sat, 6 Sep 2025 15:09:36 +0200 Subject: [PATCH 02/14] Made sure all types are handled correctly --- include/rfl/csv/read.hpp | 5 +- include/rfl/parsing/tabular/ArrowReader.hpp | 7 +- include/rfl/parsing/tabular/ArrowTypes.hpp | 325 +++++++++++++++++- .../parsing/tabular/ChunkedArrayIterator.hpp | 7 +- tests/csv/test_readme_example.cpp | 24 +- tests/csv/test_save_load.cpp | 53 +++ 6 files changed, 397 insertions(+), 24 deletions(-) create mode 100644 tests/csv/test_save_load.cpp diff --git a/include/rfl/csv/read.hpp b/include/rfl/csv/read.hpp index e4e7b1b0..5fabb3e4 100644 --- a/include/rfl/csv/read.hpp +++ b/include/rfl/csv/read.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include "../Processors.hpp" #include "../Result.hpp" @@ -57,8 +58,8 @@ Result> read(const char* _str, /// Parses an object from CSV using reflection. template -auto read(const std::string& _str) { - return read(_str.c_str(), _str.size()); +auto read(const std::string_view _str) { + return read(_str.data(), _str.size()); } /// Parses an object from a stream. 
diff --git a/include/rfl/parsing/tabular/ArrowReader.hpp b/include/rfl/parsing/tabular/ArrowReader.hpp index 153778b7..140707d6 100644 --- a/include/rfl/parsing/tabular/ArrowReader.hpp +++ b/include/rfl/parsing/tabular/ArrowReader.hpp @@ -78,8 +78,11 @@ class ArrowReader { Result new_value(auto* _chunked_array_iterators) const noexcept { alignas(ValueType) unsigned char buf[sizeof(ValueType)]{}; auto ptr = internal::ptr_cast(&buf); + auto view = to_view(*ptr); + using ViewType = std::remove_cvref_t; + try { const auto set_one = [&](std::integral_constant) { using FieldType = tuple_element_t<_i, typename ViewType::Fields>; @@ -99,11 +102,11 @@ class ArrowReader { [&](std::integer_sequence) { (set_one(std::integral_constant{}), ...); }(std::make_integer_sequence()); - - return std::move(*ptr); } catch (const std::exception& e) { return error(e.what()); } + + return std::move(*ptr); } template diff --git a/include/rfl/parsing/tabular/ArrowTypes.hpp b/include/rfl/parsing/tabular/ArrowTypes.hpp index 2c7bee83..048d0226 100644 --- a/include/rfl/parsing/tabular/ArrowTypes.hpp +++ b/include/rfl/parsing/tabular/ArrowTypes.hpp @@ -29,6 +29,10 @@ namespace rfl::parsing::tabular { template struct ArrowTypes; +template +Result::ArrayType>> transform_numerical_array( + const std::shared_ptr& _arr) noexcept; + template <> struct ArrowTypes { using ArrayType = arrow::BooleanArray; @@ -43,6 +47,16 @@ struct ArrowTypes { } } + static Result> get_array( + const std::shared_ptr& _arr) { + if (_arr->type()->Equals(data_type())) { + return Ref::make(std::static_pointer_cast(_arr)); + } else { + return error("Expected boolean array, got " + _arr->type()->ToString() + + "."); + } + } + static Result get_value(const Ref& _chunk, const int64_t _ix) { return _chunk->Value(_ix); @@ -55,6 +69,7 @@ template <> struct ArrowTypes { using ArrayType = arrow::UInt8Array; using BuilderType = arrow::UInt8Builder; + using T = uint8_t; static auto data_type() { return arrow::uint8(); } @@ -65,6 +80,11 
@@ struct ArrowTypes { } } + static Result> get_array( + const std::shared_ptr& _arr) { + return transform_numerical_array(_arr); + } + static Result get_value(const Ref& _chunk, const int64_t _ix) { return _chunk->Value(_ix); @@ -77,6 +97,7 @@ template <> struct ArrowTypes { using ArrayType = arrow::UInt16Array; using BuilderType = arrow::UInt16Builder; + using T = uint16_t; static auto data_type() { return arrow::uint16(); } @@ -87,6 +108,11 @@ struct ArrowTypes { } } + static Result> get_array( + const std::shared_ptr& _arr) { + return transform_numerical_array(_arr); + } + static Result get_value(const Ref& _chunk, const int64_t _ix) { return _chunk->Value(_ix); @@ -99,6 +125,7 @@ template <> struct ArrowTypes { using ArrayType = arrow::UInt32Array; using BuilderType = arrow::UInt32Builder; + using T = uint32_t; static auto data_type() { return arrow::uint32(); } @@ -109,6 +136,11 @@ struct ArrowTypes { } } + static Result> get_array( + const std::shared_ptr& _arr) { + return transform_numerical_array(_arr); + } + static Result get_value(const Ref& _chunk, const int64_t _ix) { return _chunk->Value(_ix); @@ -121,6 +153,7 @@ template <> struct ArrowTypes { using ArrayType = arrow::UInt64Array; using BuilderType = arrow::UInt64Builder; + using T = uint64_t; static auto data_type() { return arrow::uint64(); } @@ -131,6 +164,11 @@ struct ArrowTypes { } } + static Result> get_array( + const std::shared_ptr& _arr) { + return transform_numerical_array(_arr); + } + static Result get_value(const Ref& _chunk, const int64_t _ix) { return _chunk->Value(_ix); @@ -143,6 +181,7 @@ template <> struct ArrowTypes { using ArrayType = arrow::Int8Array; using BuilderType = arrow::Int8Builder; + using T = int8_t; static auto data_type() { return arrow::int8(); } @@ -153,6 +192,11 @@ struct ArrowTypes { } } + static Result> get_array( + const std::shared_ptr& _arr) { + return transform_numerical_array(_arr); + } + static Result get_value(const Ref& _chunk, const int64_t _ix) { return 
_chunk->Value(_ix); @@ -165,6 +209,7 @@ template <> struct ArrowTypes { using ArrayType = arrow::Int16Array; using BuilderType = arrow::Int16Builder; + using T = int16_t; static auto data_type() { return arrow::int16(); } @@ -175,6 +220,11 @@ struct ArrowTypes { } } + static Result> get_array( + const std::shared_ptr& _arr) { + return transform_numerical_array(_arr); + } + static Result get_value(const Ref& _chunk, const int64_t _ix) { return _chunk->Value(_ix); @@ -187,6 +237,7 @@ template <> struct ArrowTypes { using ArrayType = arrow::Int32Array; using BuilderType = arrow::Int32Builder; + using T = int32_t; static auto data_type() { return arrow::int32(); } @@ -197,6 +248,11 @@ struct ArrowTypes { } } + static Result> get_array( + const std::shared_ptr& _arr) { + return transform_numerical_array(_arr); + } + static Result get_value(const Ref& _chunk, const int64_t _ix) { return _chunk->Value(_ix); @@ -209,6 +265,7 @@ template <> struct ArrowTypes { using ArrayType = arrow::Int64Array; using BuilderType = arrow::Int64Builder; + using T = int64_t; static auto data_type() { return arrow::int64(); } @@ -219,6 +276,11 @@ struct ArrowTypes { } } + static Result> get_array( + const std::shared_ptr& _arr) { + return transform_numerical_array(_arr); + } + static Result get_value(const Ref& _chunk, const int64_t _ix) { return _chunk->Value(_ix); @@ -231,6 +293,7 @@ template <> struct ArrowTypes { using ArrayType = arrow::FloatArray; using BuilderType = arrow::FloatBuilder; + using T = float; static auto data_type() { return arrow::float32(); } @@ -241,6 +304,11 @@ struct ArrowTypes { } } + static Result> get_array( + const std::shared_ptr& _arr) { + return transform_numerical_array(_arr); + } + static Result get_value(const Ref& _chunk, const int64_t _ix) { return _chunk->Value(_ix); @@ -253,6 +321,7 @@ template <> struct ArrowTypes { using ArrayType = arrow::DoubleArray; using BuilderType = arrow::DoubleBuilder; + using T = double; static auto data_type() { return 
arrow::float64(); } @@ -263,6 +332,11 @@ struct ArrowTypes { } } + static Result> get_array( + const std::shared_ptr& _arr) { + return transform_numerical_array(_arr); + } + static Result get_value(const Ref& _chunk, const int64_t _ix) { return _chunk->Value(_ix); @@ -285,6 +359,16 @@ struct ArrowTypes { } } + static Result> get_array( + const std::shared_ptr& _arr) { + if (_arr->type()->Equals(data_type())) { + return Ref::make(std::static_pointer_cast(_arr)); + } else { + return error("Expected string array, got " + _arr->type()->ToString() + + "."); + } + } + static Result get_value(const Ref& _chunk, const int64_t _ix) { return std::string(_chunk->Value(_ix)); @@ -308,6 +392,11 @@ struct ArrowTypes { } } + static Result> get_array( + const std::shared_ptr& _arr) { + return ArrowTypes::get_array(_arr); + } + static Result get_value(const Ref& _chunk, const int64_t _ix) { return string_to_enum(std::string(_chunk->Value(_ix))); } @@ -331,6 +420,16 @@ struct ArrowTypes { } } + static Result> get_array( + const std::shared_ptr& _arr) { + if (_arr->type()->Equals(data_type())) { + return Ref::make(std::static_pointer_cast(_arr)); + } else { + return error("Expected binary array, got " + _arr->type()->ToString() + + "."); + } + } + static Result get_value(const Ref& _chunk, const int64_t _ix) { const auto begin = internal::ptr_cast( _chunk->Value(_ix).data()); @@ -345,15 +444,43 @@ struct ArrowTypes> { using ArrayType = arrow::TimestampArray; using BuilderType = arrow::TimestampBuilder; - static auto data_type() { return arrow::timestamp(arrow::TimeUnit::SECOND); } + static auto data_type() { return arrow::timestamp(arrow::TimeUnit::MILLI); } static void add_to_builder(const auto& _val, BuilderType* _builder) { - const auto status = _builder->Append(_val.to_time_t()); + const auto status = _builder->Append(_val.to_time_t() * 1000); if (!status.ok()) { throw std::runtime_error(status.message()); } } + static Result> get_array( + const std::shared_ptr& _arr) { + if 
(_arr->type()->Equals(data_type())) { + return Ref::make(std::static_pointer_cast(_arr)); + + } else if (_arr->type()->Equals( + arrow::timestamp(arrow::TimeUnit::SECOND))) { + return transform_time_stamp( + std::static_pointer_cast(_arr)); + + } else if (_arr->type()->Equals(arrow::timestamp(arrow::TimeUnit::MICRO))) { + return transform_time_stamp( + std::static_pointer_cast(_arr)); + + } else if (_arr->type()->Equals(arrow::timestamp(arrow::TimeUnit::NANO))) { + return transform_time_stamp( + std::static_pointer_cast(_arr)); + + } else if (_arr->type()->Equals(arrow::date64())) { + return transform_time_stamp( + std::static_pointer_cast(_arr)); + + } else { + return error("Expected timestamp or date64 array, got " + + _arr->type()->ToString() + "."); + } + } + static Result> get_value(const Ref& _chunk, const int64_t _ix) { return Timestamp<_format>(_chunk->Value(_ix) / 1000); @@ -362,6 +489,66 @@ struct ArrowTypes> { static auto make_builder() { return BuilderType(data_type(), arrow::default_memory_pool()); } + + template + static Result> transform_time_stamp( + const std::shared_ptr& _arr) noexcept { + if (!_arr) { + return error( + "transform_time_stamp: std::shared_ptr not set. 
This is a " + "bug, please report."); + } + + auto builder = + arrow::TimestampBuilder(data_type(), arrow::default_memory_pool()); + + for (int64_t i = 0; i < _arr->length(); ++i) { + if (_arr->IsNull(i)) { + const auto status = builder.AppendNull(); + if (!status.ok()) { + return error(status.message()); + } + } else { + if constexpr (_unit == arrow::TimeUnit::SECOND) { + const auto status = + builder.Append(static_cast(_arr->Value(i) * 1000)); + if (!status.ok()) { + return error(status.message()); + } + + } else if constexpr (_unit == arrow::TimeUnit::MILLI) { + const auto status = + builder.Append(static_cast(_arr->Value(i))); + if (!status.ok()) { + return error(status.message()); + } + + } else if constexpr (_unit == arrow::TimeUnit::MICRO) { + const auto status = + builder.Append(static_cast(_arr->Value(i) / 1000)); + if (!status.ok()) { + return error(status.message()); + } + + } else if constexpr (_unit == arrow::TimeUnit::NANO) { + const auto status = + builder.Append(static_cast(_arr->Value(i) / 1000000)); + if (!status.ok()) { + return error(status.message()); + } + + } else { + static_assert(rfl::always_false_v, + "Unsupported time unit."); + } + } + } + + std::shared_ptr res; + const auto status = builder.Finish(&res); + return Ref::make( + std::static_pointer_cast(res)); + } }; template @@ -380,6 +567,11 @@ struct ArrowTypes { _builder); } + static Result> get_array( + const std::shared_ptr& _arr) { + return ArrowTypes::get_array(_arr); + } + static Result get_value(const Ref& _chunk, const int64_t _ix) { return ArrowTypes>:: get_value(_chunk, _ix) @@ -415,6 +607,11 @@ struct ArrowTypes> { } } + static Result> get_array( + const std::shared_ptr& _arr) { + return ArrowTypes::get_array(_arr); + } + static auto get_value(const Ref& _chunk, const int64_t _ix) { return ArrowTypes>::get_value(_chunk, _ix) .transform([](const auto& _v) { return std::make_optional(_v); }); @@ -441,6 +638,11 @@ struct ArrowTypes> { } } + static Result> get_array( + const 
std::shared_ptr& _arr) { + return ArrowTypes::get_array(_arr); + } + static auto get_value(const Ref& _chunk, const int64_t _ix) { return ArrowTypes>::get_value(_chunk, _ix) .transform([](const auto& _v) { return std::make_shared(_v); }); @@ -467,6 +669,11 @@ struct ArrowTypes> { } } + static Result> get_array( + const std::shared_ptr& _arr) { + return ArrowTypes::get_array(_arr); + } + static auto get_value(const Ref& _chunk, const int64_t _ix) { return ArrowTypes>::get_value(_chunk, _ix) .transform([](const auto& _v) { return std::make_unique(_v); }); @@ -486,6 +693,11 @@ struct ArrowTypes> { ArrowTypes::add_to_builder(*_val, _builder); } + static Result> get_array( + const std::shared_ptr& _arr) { + return ArrowTypes::get_array(_arr); + } + static auto get_value(const Ref& _chunk, const int64_t _ix) { return ArrowTypes>::get_value(_chunk, _ix) .transform([](const auto& _v) { return Box::make(_v); }); @@ -505,6 +717,11 @@ struct ArrowTypes> { ArrowTypes::add_to_builder(*_val, _builder); } + static Result> get_array( + const std::shared_ptr& _arr) { + return ArrowTypes::get_array(_arr); + } + static auto get_value(const Ref& _chunk, const int64_t _ix) { return ArrowTypes>::get_value(_chunk, _ix) .transform([](const auto& _v) { return Ref::make(_v); }); @@ -524,6 +741,11 @@ struct ArrowTypes> { ArrowTypes::add_to_builder(_val.value(), _builder); } + static Result> get_array( + const std::shared_ptr& _arr) { + return ArrowTypes::get_array(_arr); + } + static auto get_value(const Ref& _chunk, const int64_t _ix) { return ArrowTypes>::get_value(_chunk, _ix) .transform([](const auto& _v) { return Rename<_name, T>(_v); }); @@ -532,6 +754,105 @@ struct ArrowTypes> { static auto make_builder() { return ArrowTypes::make_builder(); } }; +template +Result::ArrayType>> transform_numerical_array_impl( + const std::shared_ptr& _arr) noexcept { + if (!_arr) { + return error( + "transform_numerical_array_impl: std::shared_ptr not set. 
This is a " + "bug, please report."); + } + + auto builder = ArrowTypes::make_builder(); + + for (int64_t i = 0; i < _arr->length(); ++i) { + if (_arr->IsNull(i)) { + const auto status = builder.AppendNull(); + if (!status.ok()) { + return error(status.message()); + } + } else { + const auto status = builder.Append(static_cast(_arr->Value(i))); + if (!status.ok()) { + return error(status.message()); + } + } + } + + using TargetArrayType = typename ArrowTypes::ArrayType; + + std::shared_ptr res; + const auto status = builder.Finish(&res); + return Ref::make( + std::static_pointer_cast(res)); +} + +template +Result::ArrayType>> transform_numerical_array( + const std::shared_ptr& _arr) noexcept { + if (!_arr) { + return error( + "Could not transform the numerical array. std::shared_ptr not set."); + } + + using ArrayType = typename ArrowTypes::ArrayType; + + if (_arr->type()->Equals(ArrowTypes::data_type())) { + return Ref::make(std::static_pointer_cast(_arr)); + + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( + _arr)); + + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( + _arr)); + + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( + _arr)); + + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( + _arr)); + + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>(_arr)); + + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( + _arr)); + + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return 
transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( + _arr)); + + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( + _arr)); + + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>(_arr)); + + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>(_arr)); + + } else { + return error("Expected numerical array, got " + _arr->type()->ToString() + + "."); + } +} + } // namespace rfl::parsing::tabular #endif diff --git a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp index 68b229bf..c22cad1f 100644 --- a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp +++ b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp @@ -51,9 +51,7 @@ class ChunkedArrayIterator { [&](const auto& _c) { return ArrowTypes::get_value(_c, ix_); }); } - bool end() const noexcept { - return !current_chunk_ || (chunk_ix_ >= arr_->num_chunks()); - } + bool end() const noexcept { return chunk_ix_ >= arr_->num_chunks(); } ChunkedArrayIterator& operator++() noexcept { if (!current_chunk_) { @@ -74,8 +72,7 @@ class ChunkedArrayIterator { static Result> get_chunk(const Ref& _arr, const int _chunk_ix) noexcept { if (_chunk_ix < _arr->num_chunks()) { - return Ref::make( - std::static_pointer_cast(_arr->chunk(_chunk_ix))); + return ArrowTypes::get_array(_arr->chunk(_chunk_ix)); } else { return error("chunk_ix out of bounds."); } diff --git a/tests/csv/test_readme_example.cpp b/tests/csv/test_readme_example.cpp index 31727ef9..ca0964ed 100644 --- a/tests/csv/test_readme_example.cpp +++ b/tests/csv/test_readme_example.cpp @@ -13,30 +13,28 @@ struct Person { rfl::Rename<"firstName", std::string> first_name; rfl::Rename<"lastName", std::string> last_name = "Simpson"; 
std::string town = "Springfield"; - // TODO - // rfl::Timestamp<"%Y-%m-%d"> birthday; - // std::string birthday; - // Age age; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; rfl::Email email; }; -TEST(parquet, test_readme_example) { +TEST(csv, test_readme_example) { const auto people = std::vector({Person{.first_name = "Bart", - //.birthday = "1987-04-19", - //.age = 10, + .birthday = "1987-04-19", + .age = 10, .email = "bart@simpson.com"}, Person{.first_name = "Lisa", - //.birthday = "1987-04-19", - //.age = 8, + .birthday = "1987-04-19", + .age = 8, .email = "lisa@simpson.com"}, Person{.first_name = "Maggie", - //.birthday = "1987-04-19", - //.age = 0, + .birthday = "1987-04-19", + .age = 0, .email = "maggie@simpson.com"}, Person{.first_name = "Homer", - //.birthday = "1987-04-19", - //.age = 45, + .birthday = "1987-04-19", + .age = 45, .email = "homer@simpson.com"}}); write_and_read(people); diff --git a/tests/csv/test_save_load.cpp b/tests/csv/test_save_load.cpp new file mode 100644 index 00000000..8cf61965 --- /dev/null +++ b/tests/csv/test_save_load.cpp @@ -0,0 +1,53 @@ +#include + +#include +#include +#include +#include +#include +#include + +namespace test_save_load { + +using Age = + rfl::Validator, rfl::Maximum<130>>>; + +struct Person { + std::string first_name; + std::string last_name = "Simpson"; + std::string town = "Springfield"; + // rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(csv, test_save_load) { + const auto people1 = + std::vector({Person{.first_name = "Bart", + //.birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + //.birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + //.birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + //.birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + rfl::csv::save("people.csv", people1); + + const 
auto people2 = + rfl::csv::load>("people.csv").value(); + + const auto bytes1 = rfl::csv::write(people1); + const auto bytes2 = rfl::csv::write(people2); + + EXPECT_EQ(bytes1, bytes2); +} +} // namespace test_save_load From 3d3deb032f3523e8c83d39f5f61b8965f6ed4e79 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sat, 6 Sep 2025 15:10:00 +0200 Subject: [PATCH 03/14] Ignore CSV --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 1b782f9d..8c7aedc6 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ *.bson *.capnproto *.cbor +*.csv *.json *.fb *.flexbuf From b1aaf7b7bcbf033de09f26b322f5f3b58496aeb1 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sat, 6 Sep 2025 17:01:29 +0200 Subject: [PATCH 04/14] Make sure timestamps are handled correctly --- include/rfl/Timestamp.hpp | 6 + include/rfl/csv/read.hpp | 4 +- include/rfl/csv/write.hpp | 3 +- include/rfl/parsing/tabular/ArrowReader.hpp | 5 +- include/rfl/parsing/tabular/ArrowTypes.hpp | 400 ++++++++++-------- include/rfl/parsing/tabular/ArrowWriter.hpp | 14 +- .../parsing/tabular/ChunkedArrayIterator.hpp | 10 +- .../rfl/parsing/tabular/add_to_builder.hpp | 5 +- include/rfl/parsing/tabular/array_t.hpp | 4 +- .../parsing/tabular/make_arrow_builders.hpp | 30 +- .../parsing/tabular/make_arrow_data_types.hpp | 5 +- .../rfl/parsing/tabular/make_arrow_schema.hpp | 4 +- .../tabular/make_chunked_array_iterators.hpp | 17 +- tests/csv/test_save_load.cpp | 10 +- 14 files changed, 298 insertions(+), 219 deletions(-) diff --git a/include/rfl/Timestamp.hpp b/include/rfl/Timestamp.hpp index 9909c0f0..31eaa657 100644 --- a/include/rfl/Timestamp.hpp +++ b/include/rfl/Timestamp.hpp @@ -64,6 +64,12 @@ class Timestamp { return from_string(_str.c_str()); } + /// Returns a result containing the timestamp when successful or an Error + /// otherwise. 
+ static Result make(const auto& _str) noexcept { + return from_string(_str); + } + /// Necessary for the serialization to work. ReflectionType reflection() const { char outstr[200]; diff --git a/include/rfl/csv/read.hpp b/include/rfl/csv/read.hpp index 5fabb3e4..a18bea05 100644 --- a/include/rfl/csv/read.hpp +++ b/include/rfl/csv/read.hpp @@ -50,7 +50,9 @@ Result> read(const char* _str, const std::shared_ptr table = *maybe_table; - using ArrowReader = parsing::tabular::ArrowReader; + using ArrowReader = + parsing::tabular::ArrowReader; return ArrowReader::make(table).and_then( [](const auto& _r) { return _r.read(); }); diff --git a/include/rfl/csv/write.hpp b/include/rfl/csv/write.hpp index d00a69a1..de1d6d16 100644 --- a/include/rfl/csv/write.hpp +++ b/include/rfl/csv/write.hpp @@ -26,7 +26,8 @@ Ref to_buffer(const auto& _arr, const Settings& _settings) { using T = std::remove_cvref_t; const auto table = - parsing::tabular::ArrowWriter(_settings.chunksize) + parsing::tabular::ArrowWriter(_settings.chunksize) .to_table(_arr); const auto output_buffer = arrow::io::BufferOutputStream::Create(); diff --git a/include/rfl/parsing/tabular/ArrowReader.hpp b/include/rfl/parsing/tabular/ArrowReader.hpp index 140707d6..5956f468 100644 --- a/include/rfl/parsing/tabular/ArrowReader.hpp +++ b/include/rfl/parsing/tabular/ArrowReader.hpp @@ -23,7 +23,7 @@ namespace rfl::parsing::tabular { -template +template class ArrowReader { static_assert(!Processors::add_tags_to_variants_, "rfl::AddTagsToVariants cannot be used for tabular data."); @@ -51,7 +51,8 @@ class ArrowReader { ~ArrowReader() = default; Result read() const noexcept { - return make_chunked_array_iterators>(table_) + return make_chunked_array_iterators, _s>( + table_) .and_then([&](auto chunked_array_iterators) -> Result { VecType result; while (!end(chunked_array_iterators)) { diff --git a/include/rfl/parsing/tabular/ArrowTypes.hpp b/include/rfl/parsing/tabular/ArrowTypes.hpp index 048d0226..dcf4e3c5 100644 --- 
a/include/rfl/parsing/tabular/ArrowTypes.hpp +++ b/include/rfl/parsing/tabular/ArrowTypes.hpp @@ -26,15 +26,17 @@ namespace rfl::parsing::tabular { -template +enum class SerializationType { csv, parquet }; + +template struct ArrowTypes; -template -Result::ArrayType>> transform_numerical_array( +template +Result::ArrayType>> transform_numerical_array( const std::shared_ptr& _arr) noexcept; -template <> -struct ArrowTypes { +template +struct ArrowTypes { using ArrayType = arrow::BooleanArray; using BuilderType = arrow::BooleanBuilder; @@ -65,8 +67,8 @@ struct ArrowTypes { static auto make_builder() { return BuilderType(); } }; -template <> -struct ArrowTypes { +template +struct ArrowTypes { using ArrayType = arrow::UInt8Array; using BuilderType = arrow::UInt8Builder; using T = uint8_t; @@ -82,7 +84,7 @@ struct ArrowTypes { static Result> get_array( const std::shared_ptr& _arr) { - return transform_numerical_array(_arr); + return transform_numerical_array(_arr); } static Result get_value(const Ref& _chunk, @@ -93,8 +95,8 @@ struct ArrowTypes { static auto make_builder() { return BuilderType(); } }; -template <> -struct ArrowTypes { +template +struct ArrowTypes { using ArrayType = arrow::UInt16Array; using BuilderType = arrow::UInt16Builder; using T = uint16_t; @@ -110,7 +112,7 @@ struct ArrowTypes { static Result> get_array( const std::shared_ptr& _arr) { - return transform_numerical_array(_arr); + return transform_numerical_array(_arr); } static Result get_value(const Ref& _chunk, @@ -121,8 +123,8 @@ struct ArrowTypes { static auto make_builder() { return BuilderType(); } }; -template <> -struct ArrowTypes { +template +struct ArrowTypes { using ArrayType = arrow::UInt32Array; using BuilderType = arrow::UInt32Builder; using T = uint32_t; @@ -138,7 +140,7 @@ struct ArrowTypes { static Result> get_array( const std::shared_ptr& _arr) { - return transform_numerical_array(_arr); + return transform_numerical_array(_arr); } static Result get_value(const Ref& _chunk, @@ 
-149,8 +151,8 @@ struct ArrowTypes { static auto make_builder() { return BuilderType(); } }; -template <> -struct ArrowTypes { +template +struct ArrowTypes { using ArrayType = arrow::UInt64Array; using BuilderType = arrow::UInt64Builder; using T = uint64_t; @@ -166,7 +168,7 @@ struct ArrowTypes { static Result> get_array( const std::shared_ptr& _arr) { - return transform_numerical_array(_arr); + return transform_numerical_array(_arr); } static Result get_value(const Ref& _chunk, @@ -177,8 +179,8 @@ struct ArrowTypes { static auto make_builder() { return BuilderType(); } }; -template <> -struct ArrowTypes { +template +struct ArrowTypes { using ArrayType = arrow::Int8Array; using BuilderType = arrow::Int8Builder; using T = int8_t; @@ -194,7 +196,7 @@ struct ArrowTypes { static Result> get_array( const std::shared_ptr& _arr) { - return transform_numerical_array(_arr); + return transform_numerical_array(_arr); } static Result get_value(const Ref& _chunk, @@ -205,8 +207,8 @@ struct ArrowTypes { static auto make_builder() { return BuilderType(); } }; -template <> -struct ArrowTypes { +template +struct ArrowTypes { using ArrayType = arrow::Int16Array; using BuilderType = arrow::Int16Builder; using T = int16_t; @@ -222,7 +224,7 @@ struct ArrowTypes { static Result> get_array( const std::shared_ptr& _arr) { - return transform_numerical_array(_arr); + return transform_numerical_array(_arr); } static Result get_value(const Ref& _chunk, @@ -233,8 +235,8 @@ struct ArrowTypes { static auto make_builder() { return BuilderType(); } }; -template <> -struct ArrowTypes { +template +struct ArrowTypes { using ArrayType = arrow::Int32Array; using BuilderType = arrow::Int32Builder; using T = int32_t; @@ -250,7 +252,7 @@ struct ArrowTypes { static Result> get_array( const std::shared_ptr& _arr) { - return transform_numerical_array(_arr); + return transform_numerical_array(_arr); } static Result get_value(const Ref& _chunk, @@ -261,8 +263,8 @@ struct ArrowTypes { static auto make_builder() 
{ return BuilderType(); } }; -template <> -struct ArrowTypes { +template +struct ArrowTypes { using ArrayType = arrow::Int64Array; using BuilderType = arrow::Int64Builder; using T = int64_t; @@ -278,7 +280,7 @@ struct ArrowTypes { static Result> get_array( const std::shared_ptr& _arr) { - return transform_numerical_array(_arr); + return transform_numerical_array(_arr); } static Result get_value(const Ref& _chunk, @@ -289,8 +291,8 @@ struct ArrowTypes { static auto make_builder() { return BuilderType(); } }; -template <> -struct ArrowTypes { +template +struct ArrowTypes { using ArrayType = arrow::FloatArray; using BuilderType = arrow::FloatBuilder; using T = float; @@ -306,7 +308,7 @@ struct ArrowTypes { static Result> get_array( const std::shared_ptr& _arr) { - return transform_numerical_array(_arr); + return transform_numerical_array(_arr); } static Result get_value(const Ref& _chunk, @@ -317,8 +319,8 @@ struct ArrowTypes { static auto make_builder() { return BuilderType(); } }; -template <> -struct ArrowTypes { +template +struct ArrowTypes { using ArrayType = arrow::DoubleArray; using BuilderType = arrow::DoubleBuilder; using T = double; @@ -334,7 +336,7 @@ struct ArrowTypes { static Result> get_array( const std::shared_ptr& _arr) { - return transform_numerical_array(_arr); + return transform_numerical_array(_arr); } static Result get_value(const Ref& _chunk, @@ -345,8 +347,8 @@ struct ArrowTypes { static auto make_builder() { return BuilderType(); } }; -template <> -struct ArrowTypes { +template +struct ArrowTypes { using ArrayType = arrow::StringArray; using BuilderType = arrow::StringBuilder; @@ -377,9 +379,9 @@ struct ArrowTypes { static auto make_builder() { return BuilderType(); } }; -template +template requires enchantum::Enum -struct ArrowTypes { +struct ArrowTypes { using ArrayType = arrow::StringArray; using BuilderType = arrow::StringBuilder; @@ -394,7 +396,7 @@ struct ArrowTypes { static Result> get_array( const std::shared_ptr& _arr) { - return 
ArrowTypes::get_array(_arr); + return ArrowTypes::get_array(_arr); } static Result get_value(const Ref& _chunk, const int64_t _ix) { @@ -404,9 +406,9 @@ struct ArrowTypes { static auto make_builder() { return BuilderType(); } }; -template +template requires concepts::ContiguousByteContainer -struct ArrowTypes { +struct ArrowTypes { using ArrayType = arrow::BinaryArray; using BuilderType = arrow::BinaryBuilder; @@ -439,8 +441,10 @@ struct ArrowTypes { static auto make_builder() { return BuilderType(); } }; -template -struct ArrowTypes> { +template +struct ArrowTypes, _s> { + enum class TimeUnit { day, second, milli, micro, nano, string }; + using ArrayType = arrow::TimestampArray; using BuilderType = arrow::TimestampBuilder; @@ -460,23 +464,31 @@ struct ArrowTypes> { } else if (_arr->type()->Equals( arrow::timestamp(arrow::TimeUnit::SECOND))) { - return transform_time_stamp( + return transform_time_stamp( std::static_pointer_cast(_arr)); } else if (_arr->type()->Equals(arrow::timestamp(arrow::TimeUnit::MICRO))) { - return transform_time_stamp( + return transform_time_stamp( std::static_pointer_cast(_arr)); } else if (_arr->type()->Equals(arrow::timestamp(arrow::TimeUnit::NANO))) { - return transform_time_stamp( + return transform_time_stamp( std::static_pointer_cast(_arr)); + } else if (_arr->type()->Equals(arrow::date32())) { + return transform_time_stamp( + std::static_pointer_cast(_arr)); + } else if (_arr->type()->Equals(arrow::date64())) { - return transform_time_stamp( + return transform_time_stamp( std::static_pointer_cast(_arr)); + } else if (_arr->type()->Equals(arrow::utf8())) { + return transform_time_stamp( + std::static_pointer_cast(_arr)); + } else { - return error("Expected timestamp or date64 array, got " + + return error("Expected timestamp, date32, date64 or string array, got " + _arr->type()->ToString() + "."); } } @@ -490,7 +502,7 @@ struct ArrowTypes> { return BuilderType(data_type(), arrow::default_memory_pool()); } - template + template static 
Result> transform_time_stamp( const std::shared_ptr& _arr) noexcept { if (!_arr) { @@ -509,34 +521,45 @@ struct ArrowTypes> { return error(status.message()); } } else { - if constexpr (_unit == arrow::TimeUnit::SECOND) { + if constexpr (_unit == TimeUnit::day) { + const auto status = builder.Append( + static_cast(_arr->Value(i)) * 1000 * 24 * 60 * 60); + if (!status.ok()) { + return error(status.message()); + } + } else if constexpr (_unit == TimeUnit::second) { const auto status = builder.Append(static_cast(_arr->Value(i) * 1000)); if (!status.ok()) { return error(status.message()); } - - } else if constexpr (_unit == arrow::TimeUnit::MILLI) { + } else if constexpr (_unit == TimeUnit::milli) { const auto status = builder.Append(static_cast(_arr->Value(i))); if (!status.ok()) { return error(status.message()); } - - } else if constexpr (_unit == arrow::TimeUnit::MICRO) { + } else if constexpr (_unit == TimeUnit::micro) { const auto status = builder.Append(static_cast(_arr->Value(i) / 1000)); if (!status.ok()) { return error(status.message()); } - - } else if constexpr (_unit == arrow::TimeUnit::NANO) { + } else if constexpr (_unit == TimeUnit::nano) { const auto status = builder.Append(static_cast(_arr->Value(i) / 1000000)); if (!status.ok()) { return error(status.message()); } - + } else if constexpr (_unit == TimeUnit::string) { + const auto ts = Timestamp<_format>::make(std::string(_arr->Value(i))); + if (!ts) { + return error(ts.error().what()); + } + const auto status = builder.Append(ts->to_time_t() * 1000); + if (!status.ok()) { + return error(status.message()); + } } else { static_assert(rfl::always_false_v, "Unsupported time unit."); @@ -551,54 +574,86 @@ struct ArrowTypes> { } }; -template +template +struct ArrowTypes, SerializationType::csv> { + using ArrayType = arrow::TimestampArray; + using BuilderType = arrow::StringBuilder; + + static auto data_type() { return arrow::timestamp(arrow::TimeUnit::MILLI); } + + static void add_to_builder(const 
Timestamp<_format>& _val, + BuilderType* _builder) { + const auto status = _builder->Append(_val.str()); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result> get_array( + const std::shared_ptr& _arr) { + return ArrowTypes, + SerializationType::parquet>::get_array(_arr); + } + + static Result> get_value(const Ref& _chunk, + const int64_t _ix) { + return ArrowTypes, + SerializationType::parquet>::get_value(_chunk, _ix); + } + + static auto make_builder() { return BuilderType(); } +}; + +template requires internal::has_reflection_type_v -struct ArrowTypes { - using ArrayType = typename ArrowTypes::ArrayType; +struct ArrowTypes { + using ArrayType = + typename ArrowTypes::ArrayType; using BuilderType = - typename ArrowTypes::BuilderType; + typename ArrowTypes::BuilderType; static auto data_type() { - return ArrowTypes::data_type(); + return ArrowTypes::data_type(); } static void add_to_builder(const auto& _val, BuilderType* _builder) { - ArrowTypes::add_to_builder(_val.reflection(), - _builder); + ArrowTypes::add_to_builder( + _val.reflection(), _builder); } static Result> get_array( const std::shared_ptr& _arr) { - return ArrowTypes::get_array(_arr); + return ArrowTypes::get_array(_arr); } static Result get_value(const Ref& _chunk, const int64_t _ix) { - return ArrowTypes>:: - get_value(_chunk, _ix) - .and_then([](const auto& _v) -> Result { - try { - return T(_v); - } catch (const std::exception& e) { - return error(e.what()); - } - }); + return ArrowTypes, + _s>::get_value(_chunk, _ix) + .and_then([](const auto& _v) -> Result { + try { + return T(_v); + } catch (const std::exception& e) { + return error(e.what()); + } + }); } static auto make_builder() { - return ArrowTypes::make_builder(); + return ArrowTypes::make_builder(); } }; -template -struct ArrowTypes> { - using ArrayType = typename ArrowTypes>::ArrayType; - using BuilderType = typename ArrowTypes>::BuilderType; +template +struct ArrowTypes, _s> { + using ArrayType 
= typename ArrowTypes, _s>::ArrayType; + using BuilderType = + typename ArrowTypes, _s>::BuilderType; - static auto data_type() { return ArrowTypes::data_type(); } + static auto data_type() { return ArrowTypes::data_type(); } static void add_to_builder(const auto& _val, BuilderType* _builder) { if (_val) { - ArrowTypes::add_to_builder(*_val, _builder); + ArrowTypes::add_to_builder(*_val, _builder); } else { const auto status = _builder->AppendNull(); if (!status.ok()) { @@ -609,27 +664,28 @@ struct ArrowTypes> { static Result> get_array( const std::shared_ptr& _arr) { - return ArrowTypes::get_array(_arr); + return ArrowTypes::get_array(_arr); } static auto get_value(const Ref& _chunk, const int64_t _ix) { - return ArrowTypes>::get_value(_chunk, _ix) + return ArrowTypes, _s>::get_value(_chunk, _ix) .transform([](const auto& _v) { return std::make_optional(_v); }); } - static auto make_builder() { return ArrowTypes::make_builder(); } + static auto make_builder() { return ArrowTypes::make_builder(); } }; -template -struct ArrowTypes> { - using ArrayType = typename ArrowTypes>::ArrayType; - using BuilderType = typename ArrowTypes>::BuilderType; +template +struct ArrowTypes, _s> { + using ArrayType = typename ArrowTypes, _s>::ArrayType; + using BuilderType = + typename ArrowTypes, _s>::BuilderType; - static auto data_type() { return ArrowTypes::data_type(); } + static auto data_type() { return ArrowTypes::data_type(); } static void add_to_builder(const auto& _val, BuilderType* _builder) { if (_val) { - ArrowTypes::add_to_builder(*_val, _builder); + ArrowTypes::add_to_builder(*_val, _builder); } else { const auto status = _builder->AppendNull(); if (!status.ok()) { @@ -640,27 +696,28 @@ struct ArrowTypes> { static Result> get_array( const std::shared_ptr& _arr) { - return ArrowTypes::get_array(_arr); + return ArrowTypes::get_array(_arr); } static auto get_value(const Ref& _chunk, const int64_t _ix) { - return ArrowTypes>::get_value(_chunk, _ix) + return ArrowTypes, 
_s>::get_value(_chunk, _ix) .transform([](const auto& _v) { return std::make_shared(_v); }); } - static auto make_builder() { return ArrowTypes::make_builder(); } + static auto make_builder() { return ArrowTypes::make_builder(); } }; -template -struct ArrowTypes> { - using ArrayType = typename ArrowTypes>::ArrayType; - using BuilderType = typename ArrowTypes>::BuilderType; +template +struct ArrowTypes, _s> { + using ArrayType = typename ArrowTypes, _s>::ArrayType; + using BuilderType = + typename ArrowTypes, _s>::BuilderType; - static auto data_type() { return ArrowTypes::data_type(); } + static auto data_type() { return ArrowTypes::data_type(); } static void add_to_builder(const auto& _val, BuilderType* _builder) { if (_val) { - ArrowTypes::add_to_builder(*_val, _builder); + ArrowTypes::add_to_builder(*_val, _builder); } else { const auto status = _builder->AppendNull(); if (!status.ok()) { @@ -671,91 +728,95 @@ struct ArrowTypes> { static Result> get_array( const std::shared_ptr& _arr) { - return ArrowTypes::get_array(_arr); + return ArrowTypes::get_array(_arr); } static auto get_value(const Ref& _chunk, const int64_t _ix) { - return ArrowTypes>::get_value(_chunk, _ix) + return ArrowTypes, _s>::get_value(_chunk, _ix) .transform([](const auto& _v) { return std::make_unique(_v); }); } - static auto make_builder() { return ArrowTypes::make_builder(); } + static auto make_builder() { return ArrowTypes::make_builder(); } }; -template -struct ArrowTypes> { - using ArrayType = typename ArrowTypes>::ArrayType; - using BuilderType = typename ArrowTypes>::BuilderType; +template +struct ArrowTypes, _s> { + using ArrayType = typename ArrowTypes, _s>::ArrayType; + using BuilderType = + typename ArrowTypes, _s>::BuilderType; - static auto data_type() { return ArrowTypes::data_type(); } + static auto data_type() { return ArrowTypes::data_type(); } static void add_to_builder(const auto& _val, BuilderType* _builder) { - ArrowTypes::add_to_builder(*_val, _builder); + 
ArrowTypes::add_to_builder(*_val, _builder); } static Result> get_array( const std::shared_ptr& _arr) { - return ArrowTypes::get_array(_arr); + return ArrowTypes::get_array(_arr); } static auto get_value(const Ref& _chunk, const int64_t _ix) { - return ArrowTypes>::get_value(_chunk, _ix) + return ArrowTypes, _s>::get_value(_chunk, _ix) .transform([](const auto& _v) { return Box::make(_v); }); } - static auto make_builder() { return ArrowTypes::make_builder(); } + static auto make_builder() { return ArrowTypes::make_builder(); } }; -template -struct ArrowTypes> { - using ArrayType = typename ArrowTypes>::ArrayType; - using BuilderType = typename ArrowTypes>::BuilderType; +template +struct ArrowTypes, _s> { + using ArrayType = typename ArrowTypes, _s>::ArrayType; + using BuilderType = + typename ArrowTypes, _s>::BuilderType; - static auto data_type() { return ArrowTypes::data_type(); } + static auto data_type() { return ArrowTypes::data_type(); } static void add_to_builder(const auto& _val, BuilderType* _builder) { - ArrowTypes::add_to_builder(*_val, _builder); + ArrowTypes::add_to_builder(*_val, _builder); } static Result> get_array( const std::shared_ptr& _arr) { - return ArrowTypes::get_array(_arr); + return ArrowTypes::get_array(_arr); } static auto get_value(const Ref& _chunk, const int64_t _ix) { - return ArrowTypes>::get_value(_chunk, _ix) + return ArrowTypes, _s>::get_value(_chunk, _ix) .transform([](const auto& _v) { return Ref::make(_v); }); } - static auto make_builder() { return ArrowTypes::make_builder(); } + static auto make_builder() { return ArrowTypes::make_builder(); } }; -template -struct ArrowTypes> { - using ArrayType = typename ArrowTypes>::ArrayType; - using BuilderType = typename ArrowTypes>::BuilderType; +template +struct ArrowTypes, _s> { + using ArrayType = typename ArrowTypes, _s>::ArrayType; + using BuilderType = + typename ArrowTypes, _s>::BuilderType; - static auto data_type() { return ArrowTypes::data_type(); } + static auto 
data_type() { return ArrowTypes::data_type(); } static void add_to_builder(const auto& _val, BuilderType* _builder) { - ArrowTypes::add_to_builder(_val.value(), _builder); + ArrowTypes::add_to_builder(_val.value(), _builder); } static Result> get_array( const std::shared_ptr& _arr) { - return ArrowTypes::get_array(_arr); + return ArrowTypes::get_array(_arr); } static auto get_value(const Ref& _chunk, const int64_t _ix) { - return ArrowTypes>::get_value(_chunk, _ix) + return ArrowTypes, _s>::get_value(_chunk, _ix) .transform([](const auto& _v) { return Rename<_name, T>(_v); }); } - static auto make_builder() { return ArrowTypes::make_builder(); } + static auto make_builder() { return ArrowTypes::make_builder(); } }; -template -Result::ArrayType>> transform_numerical_array_impl( +template +Result::ArrayType>> +transform_numerical_array_impl( const std::shared_ptr& _arr) noexcept { if (!_arr) { return error( @@ -763,7 +824,7 @@ Result::ArrayType>> transform_numerical_array_impl( "bug, please report."); } - auto builder = ArrowTypes::make_builder(); + auto builder = ArrowTypes::make_builder(); for (int64_t i = 0; i < _arr->length(); ++i) { if (_arr->IsNull(i)) { @@ -779,7 +840,7 @@ Result::ArrayType>> transform_numerical_array_impl( } } - using TargetArrayType = typename ArrowTypes::ArrayType; + using TargetArrayType = typename ArrowTypes::ArrayType; std::shared_ptr res; const auto status = builder.Finish(&res); @@ -787,65 +848,68 @@ Result::ArrayType>> transform_numerical_array_impl( std::static_pointer_cast(res)); } -template -Result::ArrayType>> transform_numerical_array( +template +Result::ArrayType>> transform_numerical_array( const std::shared_ptr& _arr) noexcept { if (!_arr) { return error( "Could not transform the numerical array. 
std::shared_ptr not set."); } - using ArrayType = typename ArrowTypes::ArrayType; + using ArrayType = typename ArrowTypes::ArrayType; - if (_arr->type()->Equals(ArrowTypes::data_type())) { + if (_arr->type()->Equals(ArrowTypes::data_type())) { return Ref::make(std::static_pointer_cast(_arr)); - } else if (_arr->type()->Equals(ArrowTypes::data_type())) { - return transform_numerical_array_impl( - std::static_pointer_cast::ArrayType>( + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( _arr)); - } else if (_arr->type()->Equals(ArrowTypes::data_type())) { - return transform_numerical_array_impl( - std::static_pointer_cast::ArrayType>( + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( _arr)); - } else if (_arr->type()->Equals(ArrowTypes::data_type())) { - return transform_numerical_array_impl( - std::static_pointer_cast::ArrayType>( + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( _arr)); - } else if (_arr->type()->Equals(ArrowTypes::data_type())) { - return transform_numerical_array_impl( - std::static_pointer_cast::ArrayType>( + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( _arr)); - } else if (_arr->type()->Equals(ArrowTypes::data_type())) { - return transform_numerical_array_impl( - std::static_pointer_cast::ArrayType>(_arr)); + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( + _arr)); - } else if (_arr->type()->Equals(ArrowTypes::data_type())) { - return transform_numerical_array_impl( - std::static_pointer_cast::ArrayType>( + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return 
transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( _arr)); - } else if (_arr->type()->Equals(ArrowTypes::data_type())) { - return transform_numerical_array_impl( - std::static_pointer_cast::ArrayType>( + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( _arr)); - } else if (_arr->type()->Equals(ArrowTypes::data_type())) { - return transform_numerical_array_impl( - std::static_pointer_cast::ArrayType>( + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( _arr)); - } else if (_arr->type()->Equals(ArrowTypes::data_type())) { - return transform_numerical_array_impl( - std::static_pointer_cast::ArrayType>(_arr)); + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( + _arr)); - } else if (_arr->type()->Equals(ArrowTypes::data_type())) { - return transform_numerical_array_impl( - std::static_pointer_cast::ArrayType>(_arr)); + } else if (_arr->type()->Equals(ArrowTypes::data_type())) { + return transform_numerical_array_impl( + std::static_pointer_cast::ArrayType>( + _arr)); } else { return error("Expected numerical array, got " + _arr->type()->ToString() + diff --git a/include/rfl/parsing/tabular/ArrowWriter.hpp b/include/rfl/parsing/tabular/ArrowWriter.hpp index 5deae33d..43c274e0 100644 --- a/include/rfl/parsing/tabular/ArrowWriter.hpp +++ b/include/rfl/parsing/tabular/ArrowWriter.hpp @@ -20,7 +20,7 @@ namespace rfl::parsing::tabular { -template +template class ArrowWriter { static_assert(!Processors::add_tags_to_variants_, "rfl::AddTagsToVariants cannot be used for tabular data."); @@ -43,7 +43,7 @@ class ArrowWriter { std::shared_ptr to_table(const VecType& _data) const { return arrow::Table::Make( - make_arrow_schema>(), + make_arrow_schema, _s>(), to_chunked_arrays(_data)); } @@ 
-55,13 +55,13 @@ class ArrowWriter { size_t chunksize_; }; -template +template std::vector> -ArrowWriter::to_chunked_arrays(const VecType& _data) const { +ArrowWriter::to_chunked_arrays(const VecType& _data) const { using ValueType = typename VecType::value_type; auto builders = - make_arrow_builders>(); + make_arrow_builders, _s>(); constexpr size_t size = tuple_size_v; @@ -78,7 +78,7 @@ ArrowWriter::to_chunked_arrays(const VecType& _data) const { [&](const auto& _v, auto* _b, std::integer_sequence) { - (add_to_builder(*get<_is>(_v), &(_b->template get<_is>())), ...); + (add_to_builder<_s>(*get<_is>(_v), &(_b->template get<_is>())), ...); }(view, &builders, std::make_integer_sequence()); } @@ -103,7 +103,7 @@ ArrowWriter::to_chunked_arrays(const VecType& _data) const { } } - const auto data_types = make_arrow_data_types(); + const auto data_types = make_arrow_data_types(); return [&](std::integer_sequence) { return std::vector>( diff --git a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp index c22cad1f..776756ea 100644 --- a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp +++ b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp @@ -16,13 +16,13 @@ namespace rfl::parsing::tabular { -template +template class ChunkedArrayIterator { public: using difference_type = std::ptrdiff_t; using value_type = Result; - using ArrayType = array_t; + using ArrayType = array_t; static ChunkedArrayIterator make(const Ref& _arr) { return ChunkedArrayIterator(_arr); @@ -48,12 +48,12 @@ class ChunkedArrayIterator { } return current_chunk_.and_then( - [&](const auto& _c) { return ArrowTypes::get_value(_c, ix_); }); + [&](const auto& _c) { return ArrowTypes::get_value(_c, ix_); }); } bool end() const noexcept { return chunk_ix_ >= arr_->num_chunks(); } - ChunkedArrayIterator& operator++() noexcept { + ChunkedArrayIterator& operator++() noexcept { if (!current_chunk_) { return *this; } @@ -72,7 +72,7 @@ class 
ChunkedArrayIterator { static Result> get_chunk(const Ref& _arr, const int _chunk_ix) noexcept { if (_chunk_ix < _arr->num_chunks()) { - return ArrowTypes::get_array(_arr->chunk(_chunk_ix)); + return ArrowTypes::get_array(_arr->chunk(_chunk_ix)); } else { return error("chunk_ix out of bounds."); } diff --git a/include/rfl/parsing/tabular/add_to_builder.hpp b/include/rfl/parsing/tabular/add_to_builder.hpp index 1d42a22f..d353cbc5 100644 --- a/include/rfl/parsing/tabular/add_to_builder.hpp +++ b/include/rfl/parsing/tabular/add_to_builder.hpp @@ -8,9 +8,10 @@ namespace rfl::parsing::tabular { -template +template inline void add_to_builder(const ValueType& _val, BuilderType* _builder) { - ArrowTypes>::add_to_builder(_val, _builder); + ArrowTypes, _s>::add_to_builder(_val, + _builder); } } // namespace rfl::parsing::tabular diff --git a/include/rfl/parsing/tabular/array_t.hpp b/include/rfl/parsing/tabular/array_t.hpp index 6d083af8..2e8cd7e5 100644 --- a/include/rfl/parsing/tabular/array_t.hpp +++ b/include/rfl/parsing/tabular/array_t.hpp @@ -5,8 +5,8 @@ namespace rfl::parsing::tabular { -template -using array_t = typename ArrowTypes>::ArrayType; +template +using array_t = typename ArrowTypes, _s>::ArrayType; } // namespace rfl::parsing::tabular diff --git a/include/rfl/parsing/tabular/make_arrow_builders.hpp b/include/rfl/parsing/tabular/make_arrow_builders.hpp index 501694ee..33955f34 100644 --- a/include/rfl/parsing/tabular/make_arrow_builders.hpp +++ b/include/rfl/parsing/tabular/make_arrow_builders.hpp @@ -16,40 +16,42 @@ namespace rfl::parsing::tabular { -template -using arrow_builder_t = typename ArrowTypes< - std::remove_cvref_t>>::BuilderType; +template +using arrow_builder_t = + typename ArrowTypes>, + _s>::BuilderType; -template +template struct ArrowBuildersType; -template -struct ArrowBuildersType> { - using Type = Tuple...>; +template +struct ArrowBuildersType, _s> { + using Type = Tuple...>; static auto data_types() { return [&](std::integer_sequence) { 
return std::array, sizeof...(FieldTypes)>( - {ArrowTypes::data_type()...}); + {ArrowTypes::data_type()...}); }(std::make_integer_sequence()); } static Type make_builders() { - return Type(ArrowTypes::make_builder()...); + return Type(ArrowTypes::make_builder()...); } static auto schema() { - const auto fields = std::vector>( - {arrow::field(typename FieldTypes::Name().str(), - ArrowTypes::data_type())...}); + const auto fields = + std::vector>({arrow::field( + typename FieldTypes::Name().str(), + ArrowTypes::data_type())...}); return arrow::schema(fields); } }; -template +template auto make_arrow_builders() { - return ArrowBuildersType>::make_builders(); + return ArrowBuildersType, _s>::make_builders(); } } // namespace rfl::parsing::tabular diff --git a/include/rfl/parsing/tabular/make_arrow_data_types.hpp b/include/rfl/parsing/tabular/make_arrow_data_types.hpp index 0fb237bd..d153fcfc 100644 --- a/include/rfl/parsing/tabular/make_arrow_data_types.hpp +++ b/include/rfl/parsing/tabular/make_arrow_data_types.hpp @@ -8,9 +8,10 @@ namespace rfl::parsing::tabular { -template +template inline auto make_arrow_data_types() { - return ArrowBuildersType>>::data_types(); + return ArrowBuildersType>, + _s>::data_types(); } } // namespace rfl::parsing::tabular diff --git a/include/rfl/parsing/tabular/make_arrow_schema.hpp b/include/rfl/parsing/tabular/make_arrow_schema.hpp index b9c6268a..8139b451 100644 --- a/include/rfl/parsing/tabular/make_arrow_schema.hpp +++ b/include/rfl/parsing/tabular/make_arrow_schema.hpp @@ -8,9 +8,9 @@ namespace rfl::parsing::tabular { -template +template inline auto make_arrow_schema() { - return ArrowBuildersType>>::schema(); + return ArrowBuildersType>, _s>::schema(); } } // namespace rfl::parsing::tabular diff --git a/include/rfl/parsing/tabular/make_chunked_array_iterators.hpp b/include/rfl/parsing/tabular/make_chunked_array_iterators.hpp index 697b8b87..3bd6dc9e 100644 --- a/include/rfl/parsing/tabular/make_chunked_array_iterators.hpp +++ 
b/include/rfl/parsing/tabular/make_chunked_array_iterators.hpp @@ -15,17 +15,17 @@ #include "../../Ref.hpp" #include "../../Result.hpp" #include "../../Tuple.hpp" -#include "ArrowTypes.hpp" #include "ChunkedArrayIterator.hpp" namespace rfl::parsing::tabular { -template +template struct MakeChunkedArrayIterators; -template -struct MakeChunkedArrayIterators> { - using TupleType = Tuple...>; +template +struct MakeChunkedArrayIterators, _s> { + using TupleType = + Tuple...>; Result operator()(const Ref& _table) const { const auto get_column = @@ -40,7 +40,8 @@ struct MakeChunkedArrayIterators> { try { return TupleType( get_column(typename FieldTypes::Name().str()) - .transform(ChunkedArrayIterator::make) + .transform( + ChunkedArrayIterator::make) .value()...); } catch (const std::exception& e) { return error(e.what()); @@ -48,9 +49,9 @@ struct MakeChunkedArrayIterators> { } }; -template +template const auto make_chunked_array_iterators = - MakeChunkedArrayIterators{}; + MakeChunkedArrayIterators{}; } // namespace rfl::parsing::tabular diff --git a/tests/csv/test_save_load.cpp b/tests/csv/test_save_load.cpp index 8cf61965..456c50a0 100644 --- a/tests/csv/test_save_load.cpp +++ b/tests/csv/test_save_load.cpp @@ -16,7 +16,7 @@ struct Person { std::string first_name; std::string last_name = "Simpson"; std::string town = "Springfield"; - // rfl::Timestamp<"%Y-%m-%d"> birthday; + rfl::Timestamp<"%Y-%m-%d"> birthday; Age age; rfl::Email email; }; @@ -24,19 +24,19 @@ struct Person { TEST(csv, test_save_load) { const auto people1 = std::vector({Person{.first_name = "Bart", - //.birthday = "1987-04-19", + .birthday = "1987-04-19", .age = 10, .email = "bart@simpson.com"}, Person{.first_name = "Lisa", - //.birthday = "1987-04-19", + .birthday = "1987-04-19", .age = 8, .email = "lisa@simpson.com"}, Person{.first_name = "Maggie", - //.birthday = "1987-04-19", + .birthday = "1987-04-19", .age = 0, .email = "maggie@simpson.com"}, Person{.first_name = "Homer", - //.birthday = 
"1987-04-19", + .birthday = "1987-04-19", .age = 45, .email = "homer@simpson.com"}}); From 7c77442feea7164e3f083ead99e78d436a0bdd09 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sat, 6 Sep 2025 17:01:34 +0200 Subject: [PATCH 05/14] Adapt parquet --- include/rfl/parquet/read.hpp | 3 ++- include/rfl/parquet/write.hpp | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/rfl/parquet/read.hpp b/include/rfl/parquet/read.hpp index 68ff2e76..4da4d9f1 100644 --- a/include/rfl/parquet/read.hpp +++ b/include/rfl/parquet/read.hpp @@ -42,7 +42,8 @@ Result> read( return error("Could not read table: " + status.message()); } - using ArrowReader = parsing::tabular::ArrowReader; + using ArrowReader = parsing::tabular::ArrowReader< + T, parsing::tabular::SerializationType::parquet, Ps...>; return ArrowReader::make(table).and_then( [](const auto& _r) { return _r.read(); }); diff --git a/include/rfl/parquet/write.hpp b/include/rfl/parquet/write.hpp index 0dd397af..76757ad8 100644 --- a/include/rfl/parquet/write.hpp +++ b/include/rfl/parquet/write.hpp @@ -25,7 +25,9 @@ Ref to_buffer(const auto& _arr, const Settings& _settings) { using T = std::remove_cvref_t; const auto table = - parsing::tabular::ArrowWriter(_settings.chunksize) + parsing::tabular::ArrowWriter< + T, parsing::tabular::SerializationType::parquet, Ps...>( + _settings.chunksize) .to_table(_arr); const auto props = ::parquet::WriterProperties::Builder() From 0fccf6820e56d4080171b1711e4bca817f5a3fd2 Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Sat, 6 Sep 2025 22:25:28 +0200 Subject: [PATCH 06/14] Added settings --- include/rfl/csv/Settings.hpp | 75 +++++++++++++++++++++++++++++-- include/rfl/csv/load.hpp | 8 ++-- include/rfl/csv/read.hpp | 25 ++++++++--- include/rfl/csv/save.hpp | 8 ++-- include/rfl/csv/write.hpp | 11 +++-- tests/csv/test_readme_example.cpp | 4 +- tests/csv/test_save_load.cpp | 6 ++- tests/csv/test_settings.cpp | 44 ++++++++++++++++++ tests/csv/write_and_read.hpp | 2 +- 9 files changed, 158 insertions(+), 25 deletions(-) create mode 100644 tests/csv/test_settings.cpp diff --git a/include/rfl/csv/Settings.hpp b/include/rfl/csv/Settings.hpp index 6875f76a..236abf94 100644 --- a/include/rfl/csv/Settings.hpp +++ b/include/rfl/csv/Settings.hpp @@ -1,6 +1,7 @@ #ifndef RFL_CSV_SETTINGS_HPP_ #define RFL_CSV_SETTINGS_HPP_ +#include #include #include "../Field.hpp" @@ -9,11 +10,77 @@ namespace rfl::csv { struct Settings { - /// The size of the chunks of the csv file. - size_t chunksize = 2000; + /// Maximum number of rows processed at a time. + /// Data is processed in batches of N rows. This number + /// can impact performance. + int32_t batch_size = 1024; - Settings with_chunksize(const size_t _chunksize) const noexcept { - return replace(*this, make_field<"chunksize">(_chunksize)); + /// Field delimiter. + char delimiter = ','; + + /// Whether quoting is used. + bool quoting = true; + + /// Quoting character (if quoting is true). Only relevant for reading. + char quote_char = '"'; + + /// Whether a quote inside a value is double-quoted. Only relevant for + /// reading. + bool double_quote = true; + + /// Whether escaping is used. Only relevant for reading. + bool escaping = false; + + /// Escaping character (if escaping is true). Only relevant for reading. + char escape_char = arrow::csv::kDefaultEscapeChar; + + /// Whether values are allowed to contain CR (0x0d) and LF (0x0a) + /// characters. Only relevant for reading. 
+ bool newlines_in_values = false; + + /// Whether empty lines are ignored. + /// If false, an empty line represents a single empty value (assuming a + /// one-column CSV file). Only relevant for reading. + bool ignore_empty_lines = true; + + Settings with_batch_size(const int32_t _batch_size) const noexcept { + return replace(*this, make_field<"batch_size">(_batch_size)); + } + + Settings with_delimiter(const char _delimiter) const noexcept { + return replace(*this, make_field<"delimiter">(_delimiter)); + } + + Settings with_quoting(const bool _quoting) const noexcept { + return replace(*this, make_field<"quoting">(_quoting)); + } + + Settings with_quote_char(const char _quote_char) const noexcept { + return replace(*this, make_field<"quote_char">(quote_char)); + } + + Settings with_double_quote(const bool _double_quote) const noexcept { + return replace(*this, make_field<"double_quote">(_double_quote)); + } + + Settings with_escaping(const bool _escaping) const noexcept { + return replace(*this, make_field<"escaping">(_escaping)); + } + + Settings with_escape_char(const char _escape_char) const noexcept { + return replace(*this, make_field<"escape_char">(_escape_char)); + } + + Settings with_newlines_in_values( + const bool _newlines_in_values) const noexcept { + return replace(*this, + make_field<"newlines_in_values">(_newlines_in_values)); + } + + Settings with_ignore_empty_lines( + const bool _ignore_empty_lines) const noexcept { + return replace(*this, + make_field<"ignore_empty_lines">(_ignore_empty_lines)); } }; diff --git a/include/rfl/csv/load.hpp b/include/rfl/csv/load.hpp index ecfa06b3..a000070e 100644 --- a/include/rfl/csv/load.hpp +++ b/include/rfl/csv/load.hpp @@ -3,14 +3,16 @@ #include "../Result.hpp" #include "../io/load_string.hpp" +#include "Settings.hpp" #include "read.hpp" namespace rfl::csv { template -Result load(const std::string& _fname) { - const auto read_string = [](const auto& _str) { - return read(_str); +Result load(const 
std::string& _fname, + const Settings& _settings = Settings{}) { + const auto read_string = [&](const auto& _str) { + return read(_str, _settings); }; return rfl::io::load_string(_fname).and_then(read_string); } diff --git a/include/rfl/csv/read.hpp b/include/rfl/csv/read.hpp index a18bea05..745419f1 100644 --- a/include/rfl/csv/read.hpp +++ b/include/rfl/csv/read.hpp @@ -14,13 +14,15 @@ #include "../concepts.hpp" #include "../internal/wrap_in_rfl_array_t.hpp" #include "../parsing/tabular/ArrowReader.hpp" +#include "Settings.hpp" namespace rfl::csv { /// Parses an object from CSV using reflection. template -Result> read(const char* _str, - const size_t _size) { +Result> read( + const char* _str, const size_t _size, + const Settings& _settings = Settings{}) { arrow::io::IOContext io_context = arrow::io::default_io_context(); const auto buffer = std::make_shared( @@ -30,9 +32,18 @@ Result> read(const char* _str, std::make_shared(buffer); auto read_options = arrow::csv::ReadOptions::Defaults(); - auto parse_options = arrow::csv::ParseOptions::Defaults(); auto convert_options = arrow::csv::ConvertOptions::Defaults(); + auto parse_options = arrow::csv::ParseOptions::Defaults(); + parse_options.delimiter = _settings.delimiter; + parse_options.quoting = _settings.quoting; + parse_options.quote_char = _settings.quote_char; + parse_options.double_quote = _settings.double_quote; + parse_options.escaping = _settings.escaping; + parse_options.escape_char = _settings.escape_char; + parse_options.newlines_in_values = _settings.newlines_in_values; + parse_options.ignore_empty_lines = _settings.ignore_empty_lines; + auto maybe_reader = arrow::csv::TableReader::Make( io_context, input, read_options, parse_options, convert_options); @@ -60,16 +71,16 @@ Result> read(const char* _str, /// Parses an object from CSV using reflection. 
template -auto read(const std::string_view _str) { - return read(_str.data(), _str.size()); +auto read(const std::string_view _str, const Settings& _settings = Settings{}) { + return read(_str.data(), _str.size(), _settings); } /// Parses an object from a stream. template -auto read(std::istream& _stream) { +auto read(std::istream& _stream, const Settings& _settings = Settings{}) { std::istreambuf_iterator begin(_stream), end; auto bytes = std::vector(begin, end); - return read(bytes.data(), bytes.size()); + return read(bytes.data(), bytes.size(), _settings); } } // namespace rfl::csv diff --git a/include/rfl/csv/save.hpp b/include/rfl/csv/save.hpp index 806fa92f..56dcd928 100644 --- a/include/rfl/csv/save.hpp +++ b/include/rfl/csv/save.hpp @@ -7,14 +7,16 @@ #include "../Result.hpp" #include "../io/save_string.hpp" +#include "Settings.hpp" #include "write.hpp" namespace rfl::csv { template -Result save(const std::string& _fname, const auto& _obj) { - const auto write_func = [](const auto& _obj, auto& _stream) -> auto& { - return write(_obj, _stream); +Result save(const std::string& _fname, const auto& _obj, + const Settings& _settings = Settings{}) { + const auto write_func = [&](const auto& _obj, auto& _stream) -> auto& { + return write(_obj, _stream, _settings); }; return rfl::io::save_string(_fname, _obj, write_func); } diff --git a/include/rfl/csv/write.hpp b/include/rfl/csv/write.hpp index de1d6d16..478a8e14 100644 --- a/include/rfl/csv/write.hpp +++ b/include/rfl/csv/write.hpp @@ -27,7 +27,7 @@ Ref to_buffer(const auto& _arr, const Settings& _settings) { const auto table = parsing::tabular::ArrowWriter(_settings.chunksize) + Ps...>(_settings.batch_size) .to_table(_arr); const auto output_buffer = arrow::io::BufferOutputStream::Create(); @@ -36,9 +36,14 @@ Ref to_buffer(const auto& _arr, const Settings& _settings) { throw std::runtime_error(output_buffer.status().message()); } + auto options = arrow::csv::WriteOptions::Defaults(); + options.batch_size = 
_settings.batch_size; + options.delimiter = _settings.delimiter; + options.quoting_style = _settings.quoting ? arrow::csv::QuotingStyle::Needed + : arrow::csv::QuotingStyle::None; + const auto status = - arrow::csv::WriteCSV(*table, arrow::csv::WriteOptions::Defaults(), - output_buffer.ValueOrDie().get()); + arrow::csv::WriteCSV(*table, options, output_buffer.ValueOrDie().get()); if (!status.ok()) { throw std::runtime_error(status.message()); diff --git a/tests/csv/test_readme_example.cpp b/tests/csv/test_readme_example.cpp index ca0964ed..9bd29186 100644 --- a/tests/csv/test_readme_example.cpp +++ b/tests/csv/test_readme_example.cpp @@ -10,8 +10,8 @@ namespace test_readme_example { using Age = rfl::Validator, rfl::Maximum<130>>; struct Person { - rfl::Rename<"firstName", std::string> first_name; - rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string first_name; + std::string last_name = "Simpson"; std::string town = "Springfield"; rfl::Timestamp<"%Y-%m-%d"> birthday; Age age; diff --git a/tests/csv/test_save_load.cpp b/tests/csv/test_save_load.cpp index 456c50a0..5f1cd69b 100644 --- a/tests/csv/test_save_load.cpp +++ b/tests/csv/test_save_load.cpp @@ -40,10 +40,12 @@ TEST(csv, test_save_load) { .age = 45, .email = "homer@simpson.com"}}); - rfl::csv::save("people.csv", people1); + const auto settings = rfl::csv::Settings{}.with_delimiter(';'); + + rfl::csv::save("people.csv", people1, settings); const auto people2 = - rfl::csv::load>("people.csv").value(); + rfl::csv::load>("people.csv", settings).value(); const auto bytes1 = rfl::csv::write(people1); const auto bytes2 = rfl::csv::write(people2); diff --git a/tests/csv/test_settings.cpp b/tests/csv/test_settings.cpp new file mode 100644 index 00000000..cc228cde --- /dev/null +++ b/tests/csv/test_settings.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_settings { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { 
+ std::string first_name; + std::string last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(csv, test_settings) { + const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + const auto settings = rfl::csv::Settings{}.with_delimiter(';'); + + write_and_read(people, settings); +} +} // namespace test_settings diff --git a/tests/csv/write_and_read.hpp b/tests/csv/write_and_read.hpp index b9da1944..4640e962 100644 --- a/tests/csv/write_and_read.hpp +++ b/tests/csv/write_and_read.hpp @@ -12,7 +12,7 @@ void write_and_read(const auto& _vec, const rfl::csv::Settings& _settings = rfl::csv::Settings{}) { using T = std::remove_cvref_t; const auto serialized1 = rfl::csv::write(_vec, _settings); - const auto res = rfl::csv::read(serialized1); + const auto res = rfl::csv::read(serialized1, _settings); EXPECT_TRUE(res && true) << "Test failed on read. Error: " << res.error().what(); const auto serialized2 = rfl::csv::write(res.value(), _settings); From 4fc23e6069632a1ca0cd8219c2f0799c3eb6dfbe Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Sat, 6 Sep 2025 22:25:40 +0200 Subject: [PATCH 07/14] Added missing settings to parquet::save --- include/rfl/parquet/save.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/rfl/parquet/save.hpp b/include/rfl/parquet/save.hpp index 092abe64..93929f6e 100644 --- a/include/rfl/parquet/save.hpp +++ b/include/rfl/parquet/save.hpp @@ -7,14 +7,16 @@ #include "../Result.hpp" #include "../io/save_bytes.hpp" +#include "Settings.hpp" #include "write.hpp" namespace rfl::parquet { template -Result save(const std::string& _fname, const auto& _obj) { - const auto write_func = [](const auto& _obj, auto& _stream) -> auto& { - return write(_obj, _stream); +Result save(const std::string& _fname, const auto& _obj, + const Settings& _settings = Settings{}) { + const auto write_func = [&](const auto& _obj, auto& _stream) -> auto& { + return write(_obj, _stream, _settings); }; return rfl::io::save_bytes(_fname, _obj, write_func); } From 2def67823f964225383b897475f55fc63c91b324 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sun, 7 Sep 2025 12:19:57 +0200 Subject: [PATCH 08/14] Added support for null strings --- include/rfl/csv/Settings.hpp | 10 +++++++++- include/rfl/csv/read.hpp | 3 +++ include/rfl/csv/write.hpp | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/rfl/csv/Settings.hpp b/include/rfl/csv/Settings.hpp index 236abf94..fa015213 100644 --- a/include/rfl/csv/Settings.hpp +++ b/include/rfl/csv/Settings.hpp @@ -24,6 +24,10 @@ struct Settings { /// Quoting character (if quoting is true). Only relevant for reading. char quote_char = '"'; + /// The string to be used for null values. Quotes are not allowed in this + /// string. + std::string null_string = "n/a"; + /// Whether a quote inside a value is double-quoted. Only relevant for /// reading. 
bool double_quote = true; @@ -56,7 +60,11 @@ struct Settings { } Settings with_quote_char(const char _quote_char) const noexcept { - return replace(*this, make_field<"quote_char">(quote_char)); + return replace(*this, make_field<"quote_char">(_quote_char)); + } + + Settings with_null_string(const std::string& _null_string) const noexcept { + return replace(*this, make_field<"null_string">(_null_string)); } Settings with_double_quote(const bool _double_quote) const noexcept { diff --git a/include/rfl/csv/read.hpp b/include/rfl/csv/read.hpp index 745419f1..178382f5 100644 --- a/include/rfl/csv/read.hpp +++ b/include/rfl/csv/read.hpp @@ -33,6 +33,9 @@ Result> read( auto read_options = arrow::csv::ReadOptions::Defaults(); auto convert_options = arrow::csv::ConvertOptions::Defaults(); + convert_options.null_values = + std::vector({_settings.null_string}); + convert_options.strings_can_be_null = true; auto parse_options = arrow::csv::ParseOptions::Defaults(); parse_options.delimiter = _settings.delimiter; diff --git a/include/rfl/csv/write.hpp b/include/rfl/csv/write.hpp index 478a8e14..fd8bc353 100644 --- a/include/rfl/csv/write.hpp +++ b/include/rfl/csv/write.hpp @@ -39,6 +39,7 @@ Ref to_buffer(const auto& _arr, const Settings& _settings) { auto options = arrow::csv::WriteOptions::Defaults(); options.batch_size = _settings.batch_size; options.delimiter = _settings.delimiter; + options.null_string = _settings.null_string; options.quoting_style = _settings.quoting ? arrow::csv::QuotingStyle::Needed : arrow::csv::QuotingStyle::None; From 8a84771813508beae9db5df3ea057c3ef71c2c9e Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Sun, 7 Sep 2025 12:20:12 +0200 Subject: [PATCH 09/14] Convert utf8 to bytestrings, if necessary --- include/rfl/parsing/tabular/ArrowTypes.hpp | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/include/rfl/parsing/tabular/ArrowTypes.hpp b/include/rfl/parsing/tabular/ArrowTypes.hpp index dcf4e3c5..9e46b2fc 100644 --- a/include/rfl/parsing/tabular/ArrowTypes.hpp +++ b/include/rfl/parsing/tabular/ArrowTypes.hpp @@ -426,9 +426,13 @@ struct ArrowTypes { const std::shared_ptr& _arr) { if (_arr->type()->Equals(data_type())) { return Ref::make(std::static_pointer_cast(_arr)); + + } else if (_arr->type()->Equals(arrow::utf8())) { + return Ref::make(std::static_pointer_cast(_arr)); + } else { - return error("Expected binary array, got " + _arr->type()->ToString() + - "."); + return error("Expected binary or string array, got " + + _arr->type()->ToString() + "."); } } @@ -439,6 +443,43 @@ struct ArrowTypes { } static auto make_builder() { return BuilderType(); } + + /// Copies every value of _arr, nulls included, into a new array built + /// with arrow::BinaryBuilder; returns an error if any builder call fails. + static Result> transform_string( + const std::shared_ptr& _arr) noexcept { + if (!_arr) { + return error( + "transform_string: std::shared_ptr not set. This is a " + "bug, please report."); + } + + auto builder = arrow::BinaryBuilder(); + + for (int64_t i = 0; i < _arr->length(); ++i) { + if (_arr->IsNull(i)) { + const auto status = builder.AppendNull(); + if (!status.ok()) { + return error(status.message()); + } + } else { + const std::string_view s = _arr->Value(i); + const auto status = builder.Append( + internal::ptr_cast(s.data()), s.size()); + if (!status.ok()) { + return error(status.message()); + } + } + } + + std::shared_ptr res; + const auto status = builder.Finish(&res); + if (!status.ok()) { + return error(status.message()); + } + return Ref::make( + std::static_pointer_cast(res)); + } }; template From 244655e61f90577e7c77a9d21654d7f5c69415cd Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Sun, 7 Sep 2025 12:20:17 +0200 Subject: [PATCH 10/14] Added more tests --- tests/csv/test_boolean.cpp | 47 ++++++++++++++++++++++++++++++ tests/csv/test_box.cpp | 47 ++++++++++++++++++++++++++++++ tests/csv/test_bytestring.cpp | 24 ++++++++++++++++ tests/csv/test_camel_case.cpp | 42 +++++++++++++++++++++++++++ tests/csv/test_deque.cpp | 42 +++++++++++++++++++++++++++ tests/csv/test_enums.cpp | 44 ++++++++++++++++++++++++++++ tests/csv/test_flatten.cpp | 54 +++++++++++++++++++++++++++++++++++ tests/csv/test_literal.cpp | 42 +++++++++++++++++++++++++++ tests/csv/test_optionals.cpp | 40 ++++++++++++++++++++++++++ tests/csv/test_ref.cpp | 47 ++++++++++++++++++++++++++++++ tests/csv/test_shared_ptr.cpp | 44 ++++++++++++++++++++++++++++ tests/csv/test_unique_ptr.cpp | 44 ++++++++++++++++++++++++++++ 12 files changed, 517 insertions(+) create mode 100644 tests/csv/test_boolean.cpp create mode 100644 tests/csv/test_box.cpp create mode 100644 tests/csv/test_bytestring.cpp create mode 100644 tests/csv/test_camel_case.cpp create mode 100644 tests/csv/test_deque.cpp create mode 100644 tests/csv/test_enums.cpp create mode 100644 tests/csv/test_flatten.cpp create mode 100644 tests/csv/test_literal.cpp create mode 100644 tests/csv/test_optionals.cpp create mode 100644 tests/csv/test_ref.cpp create mode 100644 tests/csv/test_shared_ptr.cpp create mode 100644 tests/csv/test_unique_ptr.cpp diff --git a/tests/csv/test_boolean.cpp b/tests/csv/test_boolean.cpp new file mode 100644 index 00000000..6d9b01c1 --- /dev/null +++ b/tests/csv/test_boolean.cpp @@ -0,0 +1,47 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_boolean { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + bool is_child; + Age age; + rfl::Email 
email; +}; + +TEST(csv, test_boolean) { + const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .is_child = true, + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .is_child = true, + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .is_child = true, + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .is_child = false, + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_boolean diff --git a/tests/csv/test_box.cpp b/tests/csv/test_box.cpp new file mode 100644 index 00000000..1470b416 --- /dev/null +++ b/tests/csv/test_box.cpp @@ -0,0 +1,47 @@ +#include +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_box { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Box email; +}; + +TEST(csv, test_box) { + auto people = std::vector(); + people.emplace_back( + Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = rfl::make_box("bart@simpson.com")}); + people.emplace_back( + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = rfl::make_box("lisa@simpson.com")}); + people.emplace_back( + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = rfl::make_box("maggie@simpson.com")}); + people.emplace_back( + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = rfl::make_box("homer@simpson.com")}); + + write_and_read(people); +} +} // namespace test_box diff --git a/tests/csv/test_bytestring.cpp b/tests/csv/test_bytestring.cpp new file mode 100644 index 
00000000..abd7d09c --- /dev/null +++ b/tests/csv/test_bytestring.cpp @@ -0,0 +1,24 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_bytestring { + +struct TestStruct { + rfl::Bytestring bytestring; +}; + +TEST(csv, test_bytestring) { + const auto test_struct = + TestStruct{.bytestring = rfl::Bytestring({std::byte{13}, std::byte{14}, + std::byte{15}, std::byte{16}})}; + + const auto test_structs = std::vector( + {test_struct, test_struct, test_struct, test_struct}); + + write_and_read(test_structs); +} +} // namespace test_bytestring diff --git a/tests/csv/test_camel_case.cpp b/tests/csv/test_camel_case.cpp new file mode 100644 index 00000000..6d72baaf --- /dev/null +++ b/tests/csv/test_camel_case.cpp @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_camel_case { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + std::string first_name; + std::string last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(csv, test_camel_case) { + const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_camel_case diff --git a/tests/csv/test_deque.cpp b/tests/csv/test_deque.cpp new file mode 100644 index 00000000..d863ae69 --- /dev/null +++ b/tests/csv/test_deque.cpp @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_deque { + +using Age = rfl::Validator, rfl::Maximum<130>>; + 
+struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(csv, test_deque) { + const auto people = + std::deque({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_deque diff --git a/tests/csv/test_enums.cpp b/tests/csv/test_enums.cpp new file mode 100644 index 00000000..17af9078 --- /dev/null +++ b/tests/csv/test_enums.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_enums { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +enum class FirstName { Bart, Lisa, Maggie, Homer }; + +struct Person { + rfl::Rename<"firstName", FirstName> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(csv, test_enums) { + const auto people = + std::vector({Person{.first_name = FirstName::Bart, + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = FirstName::Lisa, + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = FirstName::Maggie, + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = FirstName::Homer, + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_enums diff --git 
a/tests/csv/test_flatten.cpp b/tests/csv/test_flatten.cpp new file mode 100644 index 00000000..2e0ff859 --- /dev/null +++ b/tests/csv/test_flatten.cpp @@ -0,0 +1,54 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_flatten { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Address { + std::string street; + std::string city; +}; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; + rfl::Flatten
address; +}; + +TEST(csv, test_flatten) { + const auto address = + Address{.street = "Evergreen Terrace", .city = "Springfield"}; + + const auto people = std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com", + .address = address}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com", + .address = address}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com", + .address = address}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com", + .address = address}}); + + write_and_read(people); +} +} // namespace test_flatten diff --git a/tests/csv/test_literal.cpp b/tests/csv/test_literal.cpp new file mode 100644 index 00000000..a5f74397 --- /dev/null +++ b/tests/csv/test_literal.cpp @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_literal { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + std::string first_name; + rfl::Literal<"Simpson"> last_name; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(csv, test_literal) { + const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_literal diff --git a/tests/csv/test_optionals.cpp b/tests/csv/test_optionals.cpp new file mode 100644 index 00000000..3c99df9f --- /dev/null +++ b/tests/csv/test_optionals.cpp @@ -0,0 +1,40 @@ +#include 
+#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_optionals { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + std::optional email; +}; + +TEST(csv, test_optionals) { + const auto people = std::vector( + {Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", .birthday = "1987-04-19", .age = 0}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_optionals diff --git a/tests/csv/test_ref.cpp b/tests/csv/test_ref.cpp new file mode 100644 index 00000000..2bf21901 --- /dev/null +++ b/tests/csv/test_ref.cpp @@ -0,0 +1,47 @@ +#include +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_ref { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Ref email; +}; + +TEST(csv, test_ref) { + auto people = std::vector(); + people.emplace_back( + Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = rfl::make_ref("bart@simpson.com")}); + people.emplace_back( + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = rfl::make_ref("lisa@simpson.com")}); + people.emplace_back( + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = rfl::make_ref("maggie@simpson.com")}); + people.emplace_back( + 
Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = rfl::make_ref("homer@simpson.com")}); + + write_and_read(people); +} +} // namespace test_ref diff --git a/tests/csv/test_shared_ptr.cpp b/tests/csv/test_shared_ptr.cpp new file mode 100644 index 00000000..72cab308 --- /dev/null +++ b/tests/csv/test_shared_ptr.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_shared_ptr { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + std::shared_ptr email; +}; + +TEST(csv, test_shared_ptr) { + auto people = std::vector(); + people.emplace_back( + Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = std::make_shared("bart@simpson.com")}); + people.emplace_back( + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = std::make_shared("lisa@simpson.com")}); + people.emplace_back( + Person{.first_name = "Maggie", .birthday = "1987-04-19", .age = 0}); + people.emplace_back( + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = std::make_shared("homer@simpson.com")}); + + write_and_read(people); +} +} // namespace test_shared_ptr diff --git a/tests/csv/test_unique_ptr.cpp b/tests/csv/test_unique_ptr.cpp new file mode 100644 index 00000000..b41cb1fa --- /dev/null +++ b/tests/csv/test_unique_ptr.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_unique_ptr { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + 
Age age; + std::unique_ptr email; +}; + +TEST(csv, test_unique_ptr) { + auto people = std::vector(); + people.emplace_back( + Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = std::make_unique("bart@simpson.com")}); + people.emplace_back( + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = std::make_unique("lisa@simpson.com")}); + people.emplace_back( + Person{.first_name = "Maggie", .birthday = "1987-04-19", .age = 0}); + people.emplace_back( + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = std::make_unique("homer@simpson.com")}); + + write_and_read(people); +} +} // namespace test_unique_ptr From 11c2707c529ba2b94023394ee83ae0ca8f97a857 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sun, 7 Sep 2025 12:39:46 +0200 Subject: [PATCH 11/14] Added documentation for CSV --- docs/supported_formats/csv.md | 218 ++++++++++++++++++++++++++++++ docs/supported_formats/parquet.md | 2 + mkdocs.yaml | 2 + 3 files changed, 222 insertions(+) create mode 100644 docs/supported_formats/csv.md diff --git a/docs/supported_formats/csv.md b/docs/supported_formats/csv.md new file mode 100644 index 00000000..d8933f88 --- /dev/null +++ b/docs/supported_formats/csv.md @@ -0,0 +1,218 @@ +# csv + +For CSV support, include the header `<rfl/csv.hpp>` and link to the [Apache Arrow](https://arrow.apache.org/) library. +Furthermore, when compiling reflect-cpp, you need to pass `-DREFLECTCPP_CSV=ON` to cmake. + +CSV is a tabular text format. Like other tabular formats in reflect-cpp, CSV is designed for collections of flat records and has limitations for nested or variant types. 
+ +## Reading and writing + +Suppose you have a struct like this: + +```cpp +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name; + rfl::Timestamp<"%Y-%m-%d"> birthday; + unsigned int age; + rfl::Email email; +}; +``` + +Important: CSV is a tabular format that requires collections of records. You cannot serialize individual structs - you must use containers like `std::vector`, `std::deque`, etc. + +Write a collection to a string (CSV bytes) like this: + +```cpp +const auto people = std::vector<Person>{ + Person{.first_name = "Bart", .birthday = "1987-04-19", .age = 10, .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", .birthday = "1987-04-19", .age = 8, .email = "lisa@simpson.com"} +}; + +const std::string csv_text = rfl::csv::write(people); +``` + +Parse from a string or bytes view: + +```cpp +const rfl::Result<std::vector<Person>> result = rfl::csv::read<std::vector<Person>>(csv_text); +``` + +## Settings + +CSV behavior can be configured using `rfl::csv::Settings`: + +```cpp +const auto settings = rfl::csv::Settings{} + .with_delimiter(';') + .with_quoting(true) + .with_quote_char('"') + .with_null_string("n/a") + .with_double_quote(true) + .with_escaping(false) + .with_escape_char('\\') + .with_newlines_in_values(false) + .with_ignore_empty_lines(true) + .with_batch_size(1024); + +const std::string csv_text = rfl::csv::write(people, settings); +``` + +Key options: +- `batch_size` - Maximum number of rows processed per batch (performance tuning) +- `delimiter` - Field delimiter character +- `quoting` - Whether to use quoting when writing +- `quote_char` - Quote character used when reading +- `null_string` - String representation for null values +- `double_quote` - Whether a quote inside a value is double-quoted (reading) +- `escaping` - Whether escaping is used (reading) +- `escape_char` - Escape character (reading) +- `newlines_in_values` - Whether CR/LF are allowed inside values (reading) +- `ignore_empty_lines` - Whether empty 
lines are ignored (reading) + +## Loading and saving + +You can load from and save to disk: + +```cpp +const rfl::Result<std::vector<Person>> result = rfl::csv::load<std::vector<Person>>("/path/to/file.csv"); + +const auto people = std::vector<Person>{...}; +rfl::csv::save("/path/to/file.csv", people); +``` + +With custom settings: + +```cpp +const auto settings = rfl::csv::Settings{}.with_delimiter(';'); +rfl::csv::save("/path/to/file.csv", people, settings); +``` + +## Reading from and writing into streams + +You can read from any `std::istream` and write to any `std::ostream`: + +```cpp +const rfl::Result<std::vector<Person>> result = rfl::csv::read<std::vector<Person>>(my_istream); + +const auto people = std::vector<Person>{...}; +rfl::csv::write(people, my_ostream); +``` + +With custom settings: + +```cpp +const auto settings = rfl::csv::Settings{}.with_delimiter(';'); +rfl::csv::write(people, my_ostream, settings); +``` + +## Field name transformations + +Like other formats, CSV supports field name transformations via processors, e.g. `SnakeCaseToCamelCase`: + +```cpp +const auto people = std::vector<Person>{...}; +const auto result = rfl::csv::read<std::vector<Person>, rfl::SnakeCaseToCamelCase>(csv_text); +``` + +## Enums and validation + +CSV supports enums and validated types. Enums are written/read as strings: + +```cpp +enum class FirstName { Bart, Lisa, Maggie, Homer }; + +struct Person { + rfl::Rename<"firstName", FirstName> first_name; + rfl::Rename<"lastName", std::string> last_name; + rfl::Timestamp<"%Y-%m-%d"> birthday; + rfl::Validator<unsigned int, rfl::Minimum<0>, rfl::Maximum<130>> age; + rfl::Email email; +}; +``` + +## Limitations of tabular formats + +CSV, like other tabular formats, has limitations compared to hierarchical formats such as JSON or XML: + +### Collections requirement +You must serialize collections, not individual objects: +```cpp +std::vector<Person> people = {...}; // ✅ Correct +Person person = {...}; // ❌ Wrong - must be in a container +``` + +### No nested objects +Each field must be a primitive type, enum, or a simple validated type. 
Nested objects are not automatically flattened: +```cpp +// This would NOT work as expected - nested objects are not automatically flattened +struct Address { + std::string street; + std::string city; +}; + +struct Person { + std::string first_name; + std::string last_name; + Address address; // ❌ Will cause compilation errors for CSV +}; +``` + +### Using rfl::Flatten for nested objects +If you need to include nested objects, use `rfl::Flatten` to explicitly flatten them: +```cpp +struct Address { + std::string street; + std::string city; +}; + +struct Person { + std::string first_name; + std::string last_name; + rfl::Flatten<Address> address; // ✅ This will flatten the Address fields +}; + +// The resulting CSV will have columns: first_name, last_name, street, city +``` + +### No variant types +Variant types like `std::variant`, `rfl::Variant`, or `rfl::TaggedUnion` cannot be serialized to CSV as separate columns: +```cpp +// ❌ This will NOT work +struct Person { + std::string first_name; + std::variant status; // Variant - not supported + rfl::Variant type; // rfl::Variant - not supported + rfl::TaggedUnion<"type", std::string, int> category; // TaggedUnion - not supported +}; +``` + +### No arrays (except bytestrings) +CSV output here does not support arrays (lists) of values in a single column. The only array-like field supported is binary data represented as bytestrings: +```cpp +// ❌ This will NOT work +struct Person { + std::string first_name; + std::vector<std::string> hobbies; // Array of strings - not supported + std::vector<int> scores; // Array of integers - not supported + std::vector<Address>
addresses; // Array of objects - not supported +}; + +// ✅ This works +struct Blob { + std::vector binary_data; // Binary data supported as bytestring +}; +``` + +### Use cases +CSV is ideal for: +- Data exchange and interoperability +- Simple, flat data structures with consistent types +- Human-readable datasets + +CSV is less suitable for: +- Complex nested data structures +- Data with arrays or variant types +- Strict schemas with evolving types +- Very large datasets where binary columnar formats are preferred + diff --git a/docs/supported_formats/parquet.md b/docs/supported_formats/parquet.md index f259b3ce..0cba3e1b 100644 --- a/docs/supported_formats/parquet.md +++ b/docs/supported_formats/parquet.md @@ -1,3 +1,5 @@ +# parquet + For Parquet support, you must also include the header `` and link to the [Apache Arrow](https://arrow.apache.org/) and [Apache Parquet](https://parquet.apache.org/) libraries. Furthermore, when compiling reflect-cpp, you need to pass `-DREFLECTCPP_PARQUET=ON` to cmake. diff --git a/mkdocs.yaml b/mkdocs.yaml index ae77201f..38aefa22 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -96,9 +96,11 @@ nav: - BSON: supported_formats/bson.md - Cap'n Proto: supported_formats/capnproto.md - CBOR: supported_formats/cbor.md + - CSV: supported_formats/csv.md - FlexBuffers: supported_formats/flexbuffers.md - JSON: supported_formats/json.md - MessagePack: supported_formats/msgpack.md + - Parquet: supported_formats/parquet.md - TOML: supported_formats/toml.md - UBJSON: supported_formats/ubjson.md - XML: supported_formats/xml.md From 80c5715bfee3c46ad18669c3cbba2fda8575ccd7 Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Sun, 7 Sep 2025 13:12:54 +0200 Subject: [PATCH 12/14] Removed noexcept --- include/rfl/csv/write.hpp | 2 +- include/rfl/parquet/write.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/rfl/csv/write.hpp b/include/rfl/csv/write.hpp index fd8bc353..6f9bd931 100644 --- a/include/rfl/csv/write.hpp +++ b/include/rfl/csv/write.hpp @@ -70,7 +70,7 @@ std::string write(const auto& _arr, const Settings& _settings = Settings{}) { /// Writes a CSV into an ostream. template std::ostream& write(const auto& _arr, std::ostream& _stream, - const Settings& _settings = Settings{}) noexcept { + const Settings& _settings = Settings{}) { auto buffer = to_buffer(_arr, _settings); _stream << std::string_view(*buffer); return _stream; diff --git a/include/rfl/parquet/write.hpp b/include/rfl/parquet/write.hpp index 76757ad8..d39147e3 100644 --- a/include/rfl/parquet/write.hpp +++ b/include/rfl/parquet/write.hpp @@ -72,7 +72,7 @@ std::vector write(const auto& _arr, /// Writes a PARQUET into an ostream. template std::ostream& write(const auto& _arr, std::ostream& _stream, - const Settings& _settings = Settings{}) noexcept { + const Settings& _settings = Settings{}) { auto buffer = to_buffer(_arr, _settings); _stream << std::string_view(*buffer); return _stream; From 7ff99dcd803d1e3a161c88a24dea6de1b09f7eb2 Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Sun, 7 Sep 2025 13:13:10 +0200 Subject: [PATCH 13/14] Properly use transform_string --- include/rfl/parsing/tabular/ArrowTypes.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/rfl/parsing/tabular/ArrowTypes.hpp b/include/rfl/parsing/tabular/ArrowTypes.hpp index 9e46b2fc..26acc3fc 100644 --- a/include/rfl/parsing/tabular/ArrowTypes.hpp +++ b/include/rfl/parsing/tabular/ArrowTypes.hpp @@ -428,7 +428,8 @@ struct ArrowTypes { return Ref::make(std::static_pointer_cast(_arr)); } else if (_arr->type()->Equals(arrow::utf8())) { - return Ref::make(std::static_pointer_cast(_arr)); + return transform_string( + std::static_pointer_cast(_arr)); } else { return error("Expected binary or string array, got " + From c5b7536d7dc27af901efaca169f580dbc2bf6122 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sun, 7 Sep 2025 13:13:21 +0200 Subject: [PATCH 14/14] Minor improvements in the documentation --- docs/supported_formats/csv.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/supported_formats/csv.md b/docs/supported_formats/csv.md index d8933f88..0065ea67 100644 --- a/docs/supported_formats/csv.md +++ b/docs/supported_formats/csv.md @@ -11,8 +11,8 @@ Suppose you have a struct like this: ```cpp struct Person { - rfl::Rename<"firstName", std::string> first_name; - rfl::Rename<"lastName", std::string> last_name; + std::string first_name; + std::string last_name = "Simpson"; rfl::Timestamp<"%Y-%m-%d"> birthday; unsigned int age; rfl::Email email;