From 0c640544fc484721812267978dd90346264ea000 Mon Sep 17 00:00:00 2001
From: Trent Feda <36749299+tfeda@users.noreply.github.com>
Date: Tue, 26 Jul 2022 11:20:55 -0400
Subject: [PATCH] Improve `validate_utf8` performance (#2048)

* added utf8 validation bench

* improve utf8 validation performance

* fix bench clippy errors

* Add is_char_boundary() to utf8 validation
---
 arrow/benches/array_data_validate.rs | 15 ++++++++--
 arrow/src/array/data.rs              | 41 +++++++++++++++++++++-------
 2 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/arrow/benches/array_data_validate.rs b/arrow/benches/array_data_validate.rs
index c46252becec..3cd13c09c58 100644
--- a/arrow/benches/array_data_validate.rs
+++ b/arrow/benches/array_data_validate.rs
@@ -37,11 +37,22 @@ fn create_binary_array_data(length: i32) -> ArrayData {
     .unwrap()
 }
 
-fn array_slice_benchmark(c: &mut Criterion) {
+fn validate_utf8_array(arr: &StringArray) {
+    arr.data().validate_values().unwrap();
+}
+
+fn validate_benchmark(c: &mut Criterion) {
+    //Binary Array
     c.bench_function("validate_binary_array_data 20000", |b| {
         b.iter(|| create_binary_array_data(20000))
     });
+
+    //Utf8 Array
+    let str_arr = StringArray::from(vec!["test"; 20000]);
+    c.bench_function("validate_utf8_array_data 20000", |b| {
+        b.iter(|| validate_utf8_array(&str_arr))
+    });
 }
 
-criterion_group!(benches, array_slice_benchmark);
+criterion_group!(benches, validate_benchmark);
 criterion_main!(benches);
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
index 4ae7f069e2d..c38107b2587 100644
--- a/arrow/src/array/data.rs
+++ b/arrow/src/array/data.rs
@@ -1141,16 +1141,37 @@ impl ArrayData {
         T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
     {
         let values_buffer = &self.buffers[1].as_slice();
-
-        self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
-            std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
-                ArrowError::InvalidArgumentError(format!(
-                    "Invalid UTF8 sequence at string index {} ({:?}): {}",
-                    string_index, range, e
-                ))
-            })?;
-            Ok(())
-        })
+        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
+            // Validate Offsets are correct
+            self.validate_each_offset::<T, _>(
+                values_buffer.len(),
+                |string_index, range| {
+                    if !values_str.is_char_boundary(range.start)
+                        || !values_str.is_char_boundary(range.end)
+                    {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "incomplete utf-8 byte sequence from index {}",
+                            string_index
+                        )));
+                    }
+                    Ok(())
+                },
+            )
+        } else {
+            // find specific offset that failed utf8 validation
+            self.validate_each_offset::<T, _>(
+                values_buffer.len(),
+                |string_index, range| {
+                    std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
+                        ArrowError::InvalidArgumentError(format!(
+                            "Invalid UTF8 sequence at string index {} ({:?}): {}",
+                            string_index, range, e
+                        ))
+                    })?;
+                    Ok(())
+                },
+            )
+        }
     }
 
     /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are