From 57556b422cd761d07b0e07f65d272183d194050c Mon Sep 17 00:00:00 2001 From: Alex Qyoun-ae <4062971+MazterQyou@users.noreply.github.com> Date: Thu, 23 Oct 2025 23:52:45 +0400 Subject: [PATCH] feat: Coerce strings in binary expressions Signed-off-by: Alex Qyoun-ae <4062971+MazterQyou@users.noreply.github.com> --- Cargo.lock | 4 +- datafusion-cli/Cargo.lock | 4 +- datafusion-cli/Cargo.toml | 2 +- datafusion-examples/Cargo.toml | 2 +- datafusion/common/Cargo.toml | 4 +- datafusion/core/Cargo.toml | 4 +- datafusion/core/fuzz-utils/Cargo.toml | 2 +- .../core/src/physical_optimizer/pruning.rs | 19 ++++---- datafusion/core/src/physical_plan/planner.rs | 3 +- datafusion/core/tests/sql/expr.rs | 3 +- datafusion/cube_ext/Cargo.toml | 2 +- datafusion/expr/Cargo.toml | 2 +- datafusion/expr/src/binary_rule.rs | 44 ++++++++++++++----- datafusion/jit/Cargo.toml | 2 +- datafusion/physical-expr/Cargo.toml | 2 +- .../physical-expr/src/expressions/try_cast.rs | 7 ++- 16 files changed, 66 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 682eddb4f29f..12e02fbfdc99 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -79,7 +79,7 @@ checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" [[package]] name = "arrow" version = "13.0.0" -source = "git+https://github.com/cube-js/arrow-rs.git?rev=a03d4eef5640e05dddf99fc2357ad6d58b5337cb#a03d4eef5640e05dddf99fc2357ad6d58b5337cb" +source = "git+https://github.com/cube-js/arrow-rs.git?rev=8277f9a051c90482ff9557a91c5dc3926d9ba19e#8277f9a051c90482ff9557a91c5dc3926d9ba19e" dependencies = [ "bitflags", "chrono", @@ -1502,7 +1502,7 @@ dependencies = [ [[package]] name = "parquet" version = "13.0.0" -source = "git+https://github.com/cube-js/arrow-rs.git?rev=a03d4eef5640e05dddf99fc2357ad6d58b5337cb#a03d4eef5640e05dddf99fc2357ad6d58b5337cb" +source = "git+https://github.com/cube-js/arrow-rs.git?rev=8277f9a051c90482ff9557a91c5dc3926d9ba19e#8277f9a051c90482ff9557a91c5dc3926d9ba19e" dependencies = [ "arrow", "base64", diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index ce9b1a91430b..eabc192d8ad2 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -67,7 +67,7 @@ checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" [[package]] name = "arrow" version = "13.0.0" -source = "git+https://github.com/cube-js/arrow-rs.git?rev=a03d4eef5640e05dddf99fc2357ad6d58b5337cb#a03d4eef5640e05dddf99fc2357ad6d58b5337cb" +source = "git+https://github.com/cube-js/arrow-rs.git?rev=8277f9a051c90482ff9557a91c5dc3926d9ba19e#8277f9a051c90482ff9557a91c5dc3926d9ba19e" dependencies = [ "bitflags", "chrono", @@ -1229,7 +1229,7 @@ dependencies = [ [[package]] name = "parquet" version = "13.0.0" -source = "git+https://github.com/cube-js/arrow-rs.git?rev=a03d4eef5640e05dddf99fc2357ad6d58b5337cb#a03d4eef5640e05dddf99fc2357ad6d58b5337cb" +source = "git+https://github.com/cube-js/arrow-rs.git?rev=8277f9a051c90482ff9557a91c5dc3926d9ba19e#8277f9a051c90482ff9557a91c5dc3926d9ba19e" dependencies = [ "arrow", "base64", diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index fdf9ed411aee..bbd804b76055 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -28,7 +28,7 @@ repository = "https://github.com/apache/arrow-datafusion" rust-version = "1.59" [dependencies] -arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "a03d4eef5640e05dddf99fc2357ad6d58b5337cb" } +arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "8277f9a051c90482ff9557a91c5dc3926d9ba19e" } clap = { version = "3", features = ["derive", "cargo"] } datafusion = { path = "../datafusion/core", version = "7.0.0", features = ["parquet-all-compressions"] } dirs = "4.0.0" diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 9b250118e3ac..994145e9dc47 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -34,7 +34,7 @@ path = "examples/avro_sql.rs" required-features = ["datafusion/avro"] [dev-dependencies] -arrow-flight = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "a03d4eef5640e05dddf99fc2357ad6d58b5337cb" } +arrow-flight = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "8277f9a051c90482ff9557a91c5dc3926d9ba19e" } async-trait = "0.1.41" datafusion = { path = "../datafusion/core" } futures = "0.3" diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 80a95f151e54..4e27442a338a 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -40,10 +40,10 @@ pyarrow = ["pyo3"] parquet-all-compressions = ["parquet/snap", "parquet/brotli", "parquet/flate2", "parquet/lz4", "parquet/zstd"] [dependencies] -arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "a03d4eef5640e05dddf99fc2357ad6d58b5337cb", features = ["prettyprint"] } +arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "8277f9a051c90482ff9557a91c5dc3926d9ba19e", features = ["prettyprint"] } avro-rs = { version = "0.13", features = ["snappy"], optional = true } cranelift-module = { version = "0.82.0", optional = true } ordered-float = "2.10" -parquet = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "a03d4eef5640e05dddf99fc2357ad6d58b5337cb", features = ["arrow", "base64"], default-features = false, optional = true } +parquet = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "8277f9a051c90482ff9557a91c5dc3926d9ba19e", features = ["arrow", "base64"], default-features = false, optional = true } pyo3 = { version = "0.16", optional = true } sqlparser = { git = 'https://github.com/cube-js/sqlparser-rs.git', rev = "16f051486de78a23a0ff252155dd59fc2d35497d" } diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 46b683597bb9..a90e6e497f56 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -57,7 +57,7 @@ parquet-all-compressions = ["datafusion-common/parquet-all-compressions", "parqu [dependencies] ahash = { version = "0.7", default-features = false } -arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "a03d4eef5640e05dddf99fc2357ad6d58b5337cb", features = ["prettyprint"] } +arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "8277f9a051c90482ff9557a91c5dc3926d9ba19e", features = ["prettyprint"] } async-trait = "0.1.41" avro-rs = { version = "0.13", features = ["snappy"], optional = true } chrono = { version = "0.4", default-features = false } @@ -76,7 +76,7 @@ num_cpus = "1.13.0" ordered-float = "2.10" parking_lot = "0.12" # All compression codes are disabled by default in our fork because it increases compilation time significantly. -parquet = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "a03d4eef5640e05dddf99fc2357ad6d58b5337cb", features = ["arrow", "base64"], default-features = false } +parquet = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "8277f9a051c90482ff9557a91c5dc3926d9ba19e", features = ["arrow", "base64"], default-features = false } paste = "^1.0" pin-project-lite= "^0.2.7" pyo3 = { version = "0.16", optional = true } diff --git a/datafusion/core/fuzz-utils/Cargo.toml b/datafusion/core/fuzz-utils/Cargo.toml index 4f8ff47f14fa..4707de459a80 100644 --- a/datafusion/core/fuzz-utils/Cargo.toml +++ b/datafusion/core/fuzz-utils/Cargo.toml @@ -23,6 +23,6 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "a03d4eef5640e05dddf99fc2357ad6d58b5337cb", features = ["prettyprint"] } +arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "8277f9a051c90482ff9557a91c5dc3926d9ba19e", features = ["prettyprint"] } env_logger = "0.9.0" rand = "0.8" diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index 8c9ad7c1b85e..1eb394477098 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -1055,17 +1055,14 @@ mod tests { num_containers: 1, }; - let batch = - build_statistics_record_batch(&statistics, &required_columns).unwrap(); - let expected = vec![ - "+--------+", - "| s1_min |", - "+--------+", - "| |", - "+--------+", - ]; - - assert_batches_eq!(expected, &[batch]); + // NOTE(cubesql): error is returned on invalid cast + let result = + build_statistics_record_batch(&statistics, &required_columns).unwrap_err(); + assert!( + result.to_string().contains("Cannot cast binary to string"), + "{}", + result + ); } #[test] diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index d7cddea14999..98420a672404 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -1929,7 +1929,8 @@ mod tests { // u32 AND bool col("c2").and(bool_expr), // utf8 LIKE u32 - col("c1").like(col("c2")), + // NOTE(cubesql): valid + //col("c1").like(col("c2")), ]; for case in cases { let logical_plan = LogicalPlanBuilder::scan_csv( diff --git a/datafusion/core/tests/sql/expr.rs b/datafusion/core/tests/sql/expr.rs index 8cd6b7c4cf9b..b604cdfce5e4 100644 --- a/datafusion/core/tests/sql/expr.rs +++ b/datafusion/core/tests/sql/expr.rs @@ -1057,7 +1057,8 @@ async fn test_cast_expressions() -> Result<()> { test_expression!("CAST('0' AS INT)", "0"); test_expression!("CAST(NULL AS INT)", "NULL"); test_expression!("TRY_CAST('0' AS INT)", "0"); - test_expression!("TRY_CAST('x' AS INT)", "NULL"); + // NOTE(cubesql): throws an error for this cast as PostgreSQL does + //test_expression!("TRY_CAST('x' AS INT)", "NULL"); Ok(()) } diff --git a/datafusion/cube_ext/Cargo.toml b/datafusion/cube_ext/Cargo.toml index 62bd734de763..0a7a4e1ce119 100644 --- a/datafusion/cube_ext/Cargo.toml +++ b/datafusion/cube_ext/Cargo.toml @@ -35,7 +35,7 @@ name = "cube_ext" path = "src/lib.rs" [dependencies] -arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "a03d4eef5640e05dddf99fc2357ad6d58b5337cb", features = ["prettyprint"] } +arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "8277f9a051c90482ff9557a91c5dc3926d9ba19e", features = ["prettyprint"] } chrono = { version = "0.4.16", package = "chrono", default-features = false, features = ["clock"] } datafusion-common = { path = "../common", version = "7.0.0" } datafusion-expr = { path = "../expr", version = "7.0.0" } diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index d2822ffb5825..ed4a2373d174 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -36,6 +36,6 @@ path = "src/lib.rs" [dependencies] ahash = { version = "0.7", default-features = false } -arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "a03d4eef5640e05dddf99fc2357ad6d58b5337cb", features = ["prettyprint"] } +arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "8277f9a051c90482ff9557a91c5dc3926d9ba19e", features = ["prettyprint"] } datafusion-common = { path = "../common", version = "7.0.0" } sqlparser = { git = 'https://github.com/cube-js/sqlparser-rs.git', rev = "16f051486de78a23a0ff252155dd59fc2d35497d" } diff --git a/datafusion/expr/src/binary_rule.rs b/datafusion/expr/src/binary_rule.rs index 5366c04707c2..914138a5a98e 100644 --- a/datafusion/expr/src/binary_rule.rs +++ b/datafusion/expr/src/binary_rule.rs @@ -151,9 +151,13 @@ pub fn coerce_types( fn bitwise_coercion(left_type: &DataType, right_type: &DataType) -> Option { use arrow::datatypes::DataType::*; - if !is_numeric(left_type) || !is_numeric(right_type) { - return None; + // If one of the sides is numeric and the other is a string, coercion to number is allowed + match (is_numeric(left_type), is_numeric(right_type)) { + (true, true) => (), + (false, false) => return None, + _ => return string_with_any_coercion(left_type, right_type), } + if left_type == right_type && !is_dictionary(left_type) { return Some(left_type.clone()); } @@ -242,9 +246,13 @@ fn comparison_binary_numeric_coercion( rhs_type: &DataType, ) -> Option { use arrow::datatypes::DataType::*; - if !is_numeric(lhs_type) || !is_numeric(rhs_type) { - return None; - }; + + // If one of the sides is numeric and the other is a string, coercion to number is allowed + match (is_numeric(lhs_type), is_numeric(lhs_type)) { + (true, true) => (), + (false, false) => return None, + _ => return string_with_any_coercion(lhs_type, rhs_type), + } // same type => all good if lhs_type == rhs_type { @@ -322,9 +330,12 @@ fn mathematics_numerical_coercion( use arrow::datatypes::DataType::*; // error on any non-numeric type - if !is_numeric(lhs_type) || !is_numeric(rhs_type) { - return None; - }; + // If one of the sides is numeric and the other is a string, coercion to number is allowed + match (is_numeric(lhs_type), is_numeric(rhs_type)) { + (true, true) => (), + (false, false) => return None, + _ => return string_with_any_coercion(lhs_type, rhs_type), + } // exponentiation is always Float64 if mathematics_op == &Operator::Exponentiate { @@ -557,6 +568,7 @@ fn string_boolean_equality_coercion( fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { string_coercion(lhs_type, rhs_type) .or_else(|| dictionary_coercion(lhs_type, rhs_type)) + .or_else(|| string_with_any_coercion(lhs_type, rhs_type)) } /// Coercion rules for Temporal columns: the type that both lhs and rhs can be @@ -668,6 +680,7 @@ fn eq_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { .or_else(|| dictionary_coercion(lhs_type, rhs_type)) .or_else(|| temporal_coercion(lhs_type, rhs_type)) .or_else(|| null_coercion(lhs_type, rhs_type)) + .or_else(|| string_with_any_coercion(lhs_type, rhs_type)) } pub fn distinct_coercion( @@ -803,6 +816,17 @@ fn interval_add_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { + match (lhs_type, rhs_type) { + (DataType::Utf8, t) | (t, DataType::Utf8) => Some(t.clone()), + _ => None, + } +} + #[cfg(test)] mod tests { use super::*; @@ -815,10 +839,10 @@ mod tests { fn test_coercion_error() -> Result<()> { let result_type = - coerce_types(&DataType::Float32, &Operator::Plus, &DataType::Utf8); + coerce_types(&DataType::Float32, &Operator::Plus, &DataType::Date32); if let Err(DataFusionError::Plan(e)) = result_type { - assert_eq!(e, "'Float32 + Utf8' can't be evaluated because there isn't a common type to coerce the types to"); + assert_eq!(e, "'Float32 + Date32' can't be evaluated because there isn't a common type to coerce the types to"); Ok(()) } else { Err(DataFusionError::Internal( diff --git a/datafusion/jit/Cargo.toml b/datafusion/jit/Cargo.toml index 2981154a7824..585e858f85a4 100644 --- a/datafusion/jit/Cargo.toml +++ b/datafusion/jit/Cargo.toml @@ -36,7 +36,7 @@ path = "src/lib.rs" jit = [] [dependencies] -arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "a03d4eef5640e05dddf99fc2357ad6d58b5337cb" } +arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "8277f9a051c90482ff9557a91c5dc3926d9ba19e" } cranelift = "0.82.0" cranelift-jit = "0.82.0" cranelift-module = "0.82.0" diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 31a2fb4631a3..51dbc02e6a9b 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -40,7 +40,7 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = { version = "0.7", default-features = false } -arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "a03d4eef5640e05dddf99fc2357ad6d58b5337cb", features = ["prettyprint"] } +arrow = { git = 'https://github.com/cube-js/arrow-rs.git', rev = "8277f9a051c90482ff9557a91c5dc3926d9ba19e", features = ["prettyprint"] } blake2 = { version = "^0.10.2", optional = true } blake3 = { version = "1.0", optional = true } chrono = { version = "0.4.20", default-features = false } diff --git a/datafusion/physical-expr/src/expressions/try_cast.rs b/datafusion/physical-expr/src/expressions/try_cast.rs index 6b0d3e1b1384..959378e7f360 100644 --- a/datafusion/physical-expr/src/expressions/try_cast.rs +++ b/datafusion/physical-expr/src/expressions/try_cast.rs @@ -526,13 +526,16 @@ mod tests { #[test] fn test_try_cast_utf8_i32() -> Result<()> { + // NOTE(cubesql): casting invalid Utf8 to Int32 throws an error like in PostgreSQL generic_test_cast!( StringArray, DataType::Utf8, - vec!["a", "2", "3", "b", "5"], + // vec!["a", "2", "3", "b", "5"], + vec!["2", "3", "5"], Int32Array, DataType::Int32, - vec![None, Some(2), Some(3), None, Some(5)] + // vec![None, Some(2), Some(3), None, Some(5)] + vec![Some(2), Some(3), Some(5)] ); Ok(()) }