Skip to content

Commit

Permalink
fix: Trim ".0" postfix when converting Float to Utf8
Browse files Browse the repository at this point in the history
  • Loading branch information
MazterQyou committed Mar 4, 2024
1 parent 9f2e286 commit 8a5e7e0
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 7 deletions.
2 changes: 1 addition & 1 deletion arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ flatbuffers = { version = "=2.1.1", optional = true }
hex = "0.4"
comfy-table = { version = "5.0", optional = true, default-features = false }
pyo3 = { version = "0.16", optional = true }
lexical-core = "^0.8"
lexical-core = { version = "^0.8", features = ["format"] }
multiversion = "0.6.1"
bitflags = "1.2.1"

Expand Down
43 changes: 37 additions & 6 deletions arrow/src/compute/kernels/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,9 @@ use crate::datatypes::*;
use crate::error::{ArrowError, Result};
use crate::{array::*, compute::take};
use crate::{
buffer::Buffer, util::display::array_value_to_string,
util::serialization::lexical_to_string,
buffer::Buffer,
util::display::array_value_to_string,
util::serialization::{float_lexical_to_string, lexical_to_string},
};
use num::{NumCast, ToPrimitive};

Expand Down Expand Up @@ -791,8 +792,8 @@ pub fn cast_with_options(
Int16 => cast_numeric_to_string::<Int16Type, i32>(array),
Int32 => cast_numeric_to_string::<Int32Type, i32>(array),
Int64 => cast_numeric_to_string::<Int64Type, i32>(array),
Float32 => cast_numeric_to_string::<Float32Type, i32>(array),
Float64 => cast_numeric_to_string::<Float64Type, i32>(array),
Float32 => cast_float_to_string::<Float32Type, i32>(array),
Float64 => cast_float_to_string::<Float64Type, i32>(array),
Timestamp(unit, _) => match unit {
TimeUnit::Nanosecond => {
cast_timestamp_to_string::<TimestampNanosecondType, i32>(array)
Expand Down Expand Up @@ -847,8 +848,8 @@ pub fn cast_with_options(
Int16 => cast_numeric_to_string::<Int16Type, i64>(array),
Int32 => cast_numeric_to_string::<Int32Type, i64>(array),
Int64 => cast_numeric_to_string::<Int64Type, i64>(array),
Float32 => cast_numeric_to_string::<Float32Type, i64>(array),
Float64 => cast_numeric_to_string::<Float64Type, i64>(array),
Float32 => cast_float_to_string::<Float32Type, i64>(array),
Float64 => cast_float_to_string::<Float64Type, i64>(array),
Timestamp(unit, _) => match unit {
TimeUnit::Nanosecond => {
cast_timestamp_to_string::<TimestampNanosecondType, i64>(array)
Expand Down Expand Up @@ -1541,6 +1542,36 @@ where
.collect()
}

/// Cast a float array to a string (Utf8) array.
///
/// `array` is downcast to `PrimitiveArray<FROM>` and every non-null value is
/// rendered through `float_to_string_cast`; the resulting string array is
/// returned wrapped in an `Arc`.
///
/// # Panics
///
/// Panics if `array` is not a `PrimitiveArray<FROM>`.
fn cast_float_to_string<FROM, OffsetSize>(array: &ArrayRef) -> Result<ArrayRef>
where
    FROM: ArrowFloatNumericType,
    FROM::Native:
        lexical_core::ToLexicalWithOptions<Options = lexical_core::WriteFloatOptions>,
    OffsetSize: StringOffsetSizeTrait,
{
    let floats = array
        .as_any()
        .downcast_ref::<PrimitiveArray<FROM>>()
        .unwrap();
    let strings = float_to_string_cast::<FROM, OffsetSize>(floats);
    Ok(Arc::new(strings))
}

/// Build a string array from a primitive float array.
///
/// Each slot of `from` is visited in order: a present value is formatted with
/// `float_lexical_to_string`, while a null slot stays null in the output.
fn float_to_string_cast<T, OffsetSize>(
    from: &PrimitiveArray<T>,
) -> GenericStringArray<OffsetSize>
where
    T: ArrowPrimitiveType + ArrowFloatNumericType,
    T::Native:
        lexical_core::ToLexicalWithOptions<Options = lexical_core::WriteFloatOptions>,
    OffsetSize: StringOffsetSizeTrait,
{
    // Lazily format every slot; nulls (`None`) pass through untouched.
    let rendered = from
        .iter()
        .map(|slot| slot.map(float_lexical_to_string));
    rendered.collect()
}

/// Cast Utf8 to numeric types
fn cast_string_to_numeric<T, Offset: StringOffsetSizeTrait>(
from: &ArrayRef,
Expand Down
26 changes: 26 additions & 0 deletions arrow/src/util/serialization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,29 @@ pub fn lexical_to_string<N: lexical_core::ToLexical>(n: N) -> String {
String::from_utf8_unchecked(buf)
}
}

/// Converts a float to a `String`.
///
/// The value is serialized by `lexical_core` using the `POSTGRESQL` number
/// format with the `trim_floats` option enabled, which is what drops the
/// trailing ".0" from values that have no fractional part.
///
/// # Panics
///
/// Panics if `lexical_core` rejects the write options built here.
pub fn float_lexical_to_string<
    N: lexical_core::ToLexicalWithOptions<Options = lexical_core::WriteFloatOptions>,
>(
    n: N,
) -> String {
    const FORMAT: u128 = lexical_core::format::POSTGRESQL;

    let options = lexical_core::WriteFloatOptions::builder()
        .trim_floats(true)
        .build()
        .unwrap();

    // Zero-initialise the scratch buffer instead of exposing the spare
    // capacity of an empty `Vec`: a `&mut [u8]` must only ever cover
    // initialised memory, so building one over raw capacity is not sound.
    let mut buf = vec![0u8; N::FORMATTED_SIZE_DECIMAL];
    let len =
        lexical_core::write_with_options::<_, FORMAT>(n, buf.as_mut_slice(), &options)
            .len();
    // Keep only the bytes that were actually written.
    buf.truncate(len);

    // SAFETY: `buf` now holds exactly the bytes lexical_core wrote for the
    // number. lexical_core produces a valid string, so the UTF-8 check is
    // skipped to keep the faster serializer path.
    unsafe { String::from_utf8_unchecked(buf) }
}

0 comments on commit 8a5e7e0

Please sign in to comment.