Skip to content

Over/underflow when decoding a patched base integer in RLEv2 #63

@progval

Description

@progval

When reading this file (5GB) with this code:

use std::sync::Arc;
use std::fs::File;

use arrow::datatypes::{DataType, Decimal128Type, DecimalType, Schema};
use orc_rust::arrow_reader::ArrowReaderBuilder;
use orc_rust::projection::ProjectionMask;

/// Number of rows per record batch requested from the ORC reader
/// (passed to `with_batch_size` in `main`).
pub(crate) const ORC_BATCH_SIZE: usize = 100;

/// Reproducer: opens the ORC file, projects only the `committer_offset`
/// column, and iterates over every record batch.
///
/// Panics if the file cannot be opened, if the reader builder cannot be
/// created, or on the first batch that fails to decode; in the last case the
/// message reports how many rows were read before the failure and the total
/// row count of the file.
fn main() {
    let orc_file = File::open("/srv/softwareheritage/ssd/data/vlorentz/datasets/2025-10-08/orc/revision/revision-88555803-0982-45ef-bd16-5e1b6a45cb67.orc").expect("could not open .orc");
    let builder = ArrowReaderBuilder::try_new(orc_file).expect("Could not make builder");

    // Restrict decoding to the one root column of interest.
    let columns = vec!["committer_offset"];
    let mask = ProjectionMask::named_roots(
        builder.file_metadata().root_data_type(),
        columns.as_slice(),
    );

    let reader = builder
        .with_projection(mask)
        .with_batch_size(ORC_BATCH_SIZE)
        .build();

    // Total is captured up front because iterating consumes the reader.
    let num_rows = reader.total_row_count();
    let mut rows_read = 0;
    for batch in reader {
        let batch: arrow::array::RecordBatch = match batch {
            Ok(batch) => batch,
            Err(e) => panic!("Could not read chunk (after {} rows, from ORC file of {num_rows} rows): {e}", rows_read),
        };
        rows_read += batch.num_rows();
    }
}

and this patch:

diff --git a/src/encoding/integer/rle_v2/patched_base.rs b/src/encoding/integer/rle_v2/patched_base.rs
index a6782bb..99bf309 100644
--- a/src/encoding/integer/rle_v2/patched_base.rs
+++ b/src/encoding/integer/rle_v2/patched_base.rs
@@ -116,8 +116,9 @@ pub fn read_patched_base<N: NInt, R: Read, S: EncodingSign>(
             let patch_bits = N::from_i64(patch_bits);
             let patched_value = *value | patch_bits;
 
+            println!("N={} S={} value={value} patched_value={patched_value} patch_bits={patch_bits} base={base}", std::any::type_name::<N>(), std::any::type_name::<S>());
             *value = patched_value.checked_add(&base).context(OutOfSpecSnafu {
-                msg: "over/underflow when decoding patched base integer",
+                msg: "over/underflow when decoding patched base integer C",
             })?;
 
             patch_index += 1;
@@ -139,7 +140,7 @@ pub fn read_patched_base<N: NInt, R: Read, S: EncodingSign>(
             }
         } else {
             *value = value.checked_add(&base).context(OutOfSpecSnafu {
-                msg: "over/underflow when decoding patched base integer",
+                msg: "over/underflow when decoding patched base integer D",
             })?;
         }
     }

the read fails with the following output (debug lines printed by the patch, followed by the panic):

N=i16 S=orc_rust::encoding::integer::SignedEncoding value=268 patched_value=780 patch_bits=512 base=0
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=268 patched_value=780 patch_bits=512 base=-240
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=448 patched_value=960 patch_bits=512 base=-240
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=96 patched_value=480 patch_bits=384 base=0
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=118 patched_value=630 patch_bits=512 base=-300
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=176 patched_value=1200 patch_bits=1024 base=-420
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=28 patched_value=540 patch_bits=512 base=0
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=44 patched_value=300 patch_bits=256 base=0
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=56 patched_value=1080 patch_bits=1024 base=-420
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=328 patched_value=840 patch_bits=512 base=-300
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=56 patched_value=1080 patch_bits=1024 base=-420
N=i16 S=orc_rust::encoding::integer::SignedEncoding value=23392 patched_value=-9376 patch_bits=-32768 base=-25080

thread 'main' panicked at examples/swh.rs:30:38:
Could not read chunk (after 17972064 rows, from ORC file of 42507953 rows): External error: Out of spec, message: over/underflow when decoding patched base integer C
stack backtrace:
   0: rust_begin_unwind
   1: core::panicking::panic_fmt
   2: swh::main
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.

but Spark has no issue with it. Tested with:

SELECT committer_offset
FROM orc.`/srv/softwareheritage/ssd/data/vlorentz/datasets/2025-10-08/orc/revision/revision-88555803-0982-45ef-bd16-5e1b6a45cb67.orc`
LIMIT 100 OFFSET 17972064;

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions