-
Notifications
You must be signed in to change notification settings - Fork 16
Closed
Description
When reading this file (5GB) with this code:
use std::sync::Arc;
use std::fs::File;
use arrow::datatypes::{DataType, Decimal128Type, DecimalType, Schema};
use orc_rust::arrow_reader::ArrowReaderBuilder;
use orc_rust::projection::ProjectionMask;
pub(crate) const ORC_BATCH_SIZE: usize = 100;
/// Minimal reproduction program: reads only the `committer_offset` column of a
/// single ORC file in batches of `ORC_BATCH_SIZE` rows and counts the rows of
/// every batch that was read successfully.
///
/// # Panics
///
/// Panics if the file cannot be opened, if the reader builder cannot be
/// created, or on the first batch that fails to read. In the last case the
/// message reports how many rows had been read before the failure.
fn main() {
let file = File::open("/srv/softwareheritage/ssd/data/vlorentz/datasets/2025-10-08/orc/revision/revision-88555803-0982-45ef-bd16-5e1b6a45cb67.orc").expect("could not open .orc");
let reader_builder = ArrowReaderBuilder::try_new(file).expect("Could not make builder");
// Build a projection from the single root field name, resolved against the
// file's root data type.
let field_names = vec!["committer_offset"];
let projection = ProjectionMask::named_roots(
reader_builder.file_metadata().root_data_type(),
field_names.as_slice(),
);
// Shadow the builder with one configured with the projection and batch size.
let reader_builder = reader_builder
.with_projection(projection)
.with_batch_size(ORC_BATCH_SIZE);
let reader = reader_builder.build();
// Captured before the loop consumes `reader`; used only in the panic message.
let num_rows = reader.total_row_count();
// Running total of rows in the batches read successfully so far.
let mut i = 0;
for chunk in reader {
// Each item is a `Result`; abort on the first error, reporting progress.
let chunk: arrow::array::RecordBatch =
chunk.unwrap_or_else(|e| panic!("Could not read chunk (after {} rows, from ORC file of {num_rows} rows): {e}", i));
i += chunk.num_rows();
}
}and this patch:
diff --git a/src/encoding/integer/rle_v2/patched_base.rs b/src/encoding/integer/rle_v2/patched_base.rs
index a6782bb..99bf309 100644
--- a/src/encoding/integer/rle_v2/patched_base.rs
+++ b/src/encoding/integer/rle_v2/patched_base.rs
@@ -116,8 +116,9 @@ pub fn read_patched_base<N: NInt, R: Read, S: EncodingSign>(
let patch_bits = N::from_i64(patch_bits);
let patched_value = *value | patch_bits;
+ println!("N={} S={} value={value} patched_value={patched_value} patch_bits={patch_bits} base={base}", std::any::type_name::<N>(), std::any::type_name::<S>());
*value = patched_value.checked_add(&base).context(OutOfSpecSnafu {
- msg: "over/underflow when decoding patched base integer",
+ msg: "over/underflow when decoding patched base integer C",
})?;
patch_index += 1;
@@ -139,7 +140,7 @@ pub fn read_patched_base<N: NInt, R: Read, S: EncodingSign>(
}
} else {
*value = value.checked_add(&base).context(OutOfSpecSnafu {
- msg: "over/underflow when decoding patched base integer",
+ msg: "over/underflow when decoding patched base integer D",
})?;
}
}

errors with:
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=268 patched_value=780 patch_bits=512 base=0
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=268 patched_value=780 patch_bits=512 base=-240
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=448 patched_value=960 patch_bits=512 base=-240
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=96 patched_value=480 patch_bits=384 base=0
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=118 patched_value=630 patch_bits=512 base=-300
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=176 patched_value=1200 patch_bits=1024 base=-420
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=28 patched_value=540 patch_bits=512 base=0
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=44 patched_value=300 patch_bits=256 base=0
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=56 patched_value=1080 patch_bits=1024 base=-420
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=328 patched_value=840 patch_bits=512 base=-300
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=56 patched_value=1080 patch_bits=1024 base=-420
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=23392 patched_value=-9376 patch_bits=-32768 base=-25080
thread 'main' panicked at examples/swh.rs:30:38:
Could not read chunk (after 17972064 rows, from ORC file of 42507953 rows): External error: Out of spec, message: over/underflow when decoding patched base integer C
stack backtrace:
0: rust_begin_unwind
1: core::panicking::panic_fmt
2: swh::main
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.
but Spark has no issue with it. Tested with:
SELECT committer_offset
FROM orc.`/srv/softwareheritage/ssd/data/vlorentz/datasets/2025-10-08/orc/revision/revision-88555803-0982-45ef-bd16-5e1b6a45cb67.orc`
LIMIT 100 OFFSET 17972064;
Metadata
Assignees
Labels
No labels