From fff55ac8311207cbc2100a54909376e39be1f5d7 Mon Sep 17 00:00:00 2001 From: LFC <990479+MichaelScofield@users.noreply.github.com> Date: Wed, 7 May 2025 11:23:48 +0800 Subject: [PATCH 1/5] update Arrow to 56.0.0 --- Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d563db9..5308c2d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,10 +32,10 @@ rust-version = "1.73" all-features = true [dependencies] -arrow = { version = ">= 53.1.0, < 55.0.0", features = ["prettyprint", "chrono-tz", "ipc_compression"] } +arrow = { version = "56.0", features = ["prettyprint", "chrono-tz", "ipc_compression"] } bytemuck = { version = "1.18.0", features = ["must_cast"] } bytes = "1.4" -chrono = { version = ">= 0.4.37, < 0.4.40", default-features = false, features = ["std"] } +chrono = { version = "0.4.41", default-features = false, features = ["std"] } chrono-tz = "0.10" fallible-streaming-iterator = { version = "0.1" } flate2 = "1" @@ -68,8 +68,8 @@ clap = { version = "4.5.4", features = ["derive"], optional = true } opendal = { version = "0.50", optional = true, default-features = false } [dev-dependencies] -arrow-ipc = { version = "53.0.0", features = ["lz4"] } -arrow-json = "53.0.0" +arrow-ipc = { version = "56.0", features = ["lz4"] } +arrow-json = "56.0" criterion = { version = "0.5", default-features = false, features = ["async_tokio"] } opendal = { version = "0.50", default-features = false, features = ["services-memory"] } pretty_assertions = "1.3.0" From 24caf3c14e7714ab1612fc2946002e4c3a79545d Mon Sep 17 00:00:00 2001 From: luofucong Date: Thu, 14 Aug 2025 19:29:41 +0800 Subject: [PATCH 2/5] fix ci --- format/orc_proto.proto | 23 ----------------------- src/arrow_writer.rs | 2 +- src/encoding/integer/mod.rs | 1 + src/proto.rs | 35 ----------------------------------- 4 files changed, 2 insertions(+), 59 deletions(-) diff --git a/format/orc_proto.proto b/format/orc_proto.proto index ff71659..e899912 100644 --- a/format/orc_proto.proto +++ b/format/orc_proto.proto @@ -100,24 +100,12 @@ message ColumnStatistics { optional CollectionStatistics collectionStatistics = 12; } -message RowIndexEntry { - repeated uint64 positions = 1 [packed=true]; - optional ColumnStatistics statistics = 2; -} - -message RowIndex { - repeated RowIndexEntry entry = 1; -} - message BloomFilter { optional uint32 numHashFunctions = 1; repeated fixed64 bitset = 2; optional bytes utf8bitset = 3; } -message BloomFilterIndex { - repeated BloomFilter bloomFilter = 1; -} message Stream { // if you add new index stream kinds, you need to make sure to update @@ -270,23 +258,12 @@ message Metadata { repeated StripeStatistics stripeStats = 1; } -// In ORC v2 (and for encrypted columns in v1), each column has -// their column statistics written separately. -message ColumnarStripeStatistics { - // one value for each stripe in the file - repeated ColumnStatistics colStats = 1; -} - enum EncryptionAlgorithm { UNKNOWN_ENCRYPTION = 0; // used for detecting future algorithms AES_CTR_128 = 1; AES_CTR_256 = 2; } -message FileStatistics { - repeated ColumnStatistics column = 1; -} - // How was the data masked? This isn't necessary for reading the file, but // is documentation about how the file was written. message DataMask { diff --git a/src/arrow_writer.rs b/src/arrow_writer.rs index 0b4085d..e322493 100644 --- a/src/arrow_writer.rs +++ b/src/arrow_writer.rs @@ -350,7 +350,7 @@ mod tests { ) .unwrap(); - let rows = roundtrip(&[batch.clone()]); + let rows = roundtrip(std::slice::from_ref(&batch)); assert_eq!(batch, rows[0]); } diff --git a/src/encoding/integer/mod.rs b/src/encoding/integer/mod.rs index f652d4e..be8919b 100644 --- a/src/encoding/integer/mod.rs +++ b/src/encoding/integer/mod.rs @@ -39,6 +39,7 @@ use crate::{ use super::PrimitiveValueDecoder; +#[allow(unused)] pub mod rle_v1; pub mod rle_v2; mod util; diff --git a/src/proto.rs b/src/proto.rs index ae71cdb..c425e5b 100644 --- a/src/proto.rs +++ b/src/proto.rs @@ -145,20 +145,6 @@ pub struct ColumnStatistics { } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] -pub struct RowIndexEntry { - #[prost(uint64, repeated, tag = "1")] - pub positions: ::prost::alloc::vec::Vec, - #[prost(message, optional, tag = "2")] - pub statistics: ::core::option::Option, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct RowIndex { - #[prost(message, repeated, tag = "1")] - pub entry: ::prost::alloc::vec::Vec, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] pub struct BloomFilter { #[prost(uint32, optional, tag = "1")] pub num_hash_functions: ::core::option::Option, @@ -169,12 +155,6 @@ pub struct BloomFilter { } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] -pub struct BloomFilterIndex { - #[prost(message, repeated, tag = "1")] - pub bloom_filter: ::prost::alloc::vec::Vec, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] pub struct Stream { #[prost(enumeration = "stream::Kind", optional, tag = "1")] pub kind: ::core::option::Option, @@ -499,21 +479,6 @@ pub struct Metadata { #[prost(message, repeated, tag = "1")] pub stripe_stats: ::prost::alloc::vec::Vec, } -/// In ORC v2 (and for encrypted columns in v1), each column has -/// their column statistics written separately. -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct ColumnarStripeStatistics { - /// one value for each stripe in the file - #[prost(message, repeated, tag = "1")] - pub col_stats: ::prost::alloc::vec::Vec, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct FileStatistics { - #[prost(message, repeated, tag = "1")] - pub column: ::prost::alloc::vec::Vec, -} /// How was the data masked? This isn't necessary for reading the file, but /// is documentation about how the file was written. #[allow(clippy::derive_partial_eq_without_eq)] From 92231ca5fae5c14d07bb93d31f2fdcc203c5024d Mon Sep 17 00:00:00 2001 From: LFC <990479+MichaelScofield@users.noreply.github.com> Date: Fri, 15 Aug 2025 12:48:25 +0800 Subject: [PATCH 3/5] Update orc_proto.proto --- format/orc_proto.proto | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/format/orc_proto.proto b/format/orc_proto.proto index e899912..ff71659 100644 --- a/format/orc_proto.proto +++ b/format/orc_proto.proto @@ -100,12 +100,24 @@ message ColumnStatistics { optional CollectionStatistics collectionStatistics = 12; } +message RowIndexEntry { + repeated uint64 positions = 1 [packed=true]; + optional ColumnStatistics statistics = 2; +} + +message RowIndex { + repeated RowIndexEntry entry = 1; +} + message BloomFilter { optional uint32 numHashFunctions = 1; repeated fixed64 bitset = 2; optional bytes utf8bitset = 3; } +message BloomFilterIndex { + repeated BloomFilter bloomFilter = 1; +} message Stream { // if you add new index stream kinds, you need to make sure to update @@ -258,12 +270,23 @@ message Metadata { repeated StripeStatistics stripeStats = 1; } +// In ORC v2 (and for encrypted columns in v1), each column has +// their column statistics written separately. +message ColumnarStripeStatistics { + // one value for each stripe in the file + repeated ColumnStatistics colStats = 1; +} + enum EncryptionAlgorithm { UNKNOWN_ENCRYPTION = 0; // used for detecting future algorithms AES_CTR_128 = 1; AES_CTR_256 = 2; } +message FileStatistics { + repeated ColumnStatistics column = 1; +} + // How was the data masked? This isn't necessary for reading the file, but // is documentation about how the file was written. message DataMask { From 97c9ce82c6d3a7c3834007604e61d458cbf659c1 Mon Sep 17 00:00:00 2001 From: LFC <990479+MichaelScofield@users.noreply.github.com> Date: Fri, 15 Aug 2025 12:49:03 +0800 Subject: [PATCH 4/5] Update mod.rs --- src/encoding/integer/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/encoding/integer/mod.rs b/src/encoding/integer/mod.rs index be8919b..f652d4e 100644 --- a/src/encoding/integer/mod.rs +++ b/src/encoding/integer/mod.rs @@ -39,7 +39,6 @@ use crate::{ use super::PrimitiveValueDecoder; -#[allow(unused)] pub mod rle_v1; pub mod rle_v2; mod util; From cd9054a47ff51aa2a898c3395917650a06e35f37 Mon Sep 17 00:00:00 2001 From: LFC <990479+MichaelScofield@users.noreply.github.com> Date: Fri, 15 Aug 2025 12:52:04 +0800 Subject: [PATCH 5/5] Update proto.rs --- src/proto.rs | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/proto.rs b/src/proto.rs index c425e5b..ae71cdb 100644 --- a/src/proto.rs +++ b/src/proto.rs @@ -145,6 +145,20 @@ pub struct ColumnStatistics { } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] +pub struct RowIndexEntry { + #[prost(uint64, repeated, tag = "1")] + pub positions: ::prost::alloc::vec::Vec, + #[prost(message, optional, tag = "2")] + pub statistics: ::core::option::Option, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct RowIndex { + #[prost(message, repeated, tag = "1")] + pub entry: ::prost::alloc::vec::Vec, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] pub struct BloomFilter { #[prost(uint32, optional, tag = "1")] pub num_hash_functions: ::core::option::Option, @@ -155,6 +169,12 @@ pub struct BloomFilter { } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] +pub struct BloomFilterIndex { + #[prost(message, repeated, tag = "1")] + pub bloom_filter: ::prost::alloc::vec::Vec, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] pub struct Stream { #[prost(enumeration = "stream::Kind", optional, tag = "1")] pub kind: ::core::option::Option, @@ -479,6 +499,21 @@ pub struct Metadata { #[prost(message, repeated, tag = "1")] pub stripe_stats: ::prost::alloc::vec::Vec, } +/// In ORC v2 (and for encrypted columns in v1), each column has +/// their column statistics written separately. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ColumnarStripeStatistics { + /// one value for each stripe in the file + #[prost(message, repeated, tag = "1")] + pub col_stats: ::prost::alloc::vec::Vec, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct FileStatistics { + #[prost(message, repeated, tag = "1")] + pub column: ::prost::alloc::vec::Vec, +} /// How was the data masked? This isn't necessary for reading the file, but /// is documentation about how the file was written. #[allow(clippy::derive_partial_eq_without_eq)]