From 63f27961bd3c8a37805d796003a8b9bf63442f02 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Fri, 9 Sep 2022 09:25:47 +0800 Subject: [PATCH 1/6] chore(query): add char/ord/soundex to function-v2 --- src/query/expression/src/function.rs | 54 +++++ src/query/expression/src/util.rs | 1 + src/query/expression/src/values.rs | 2 +- src/query/functions-v2/src/scalars/string.rs | 100 ++++++++++ .../src/scalars/string_multi_args.rs | 185 +++++++++++------- .../functions-v2/tests/it/scalars/string.rs | 42 ++++ .../tests/it/scalars/testdata/string.txt | 136 +++++++++++++ 7 files changed, 443 insertions(+), 77 deletions(-) diff --git a/src/query/expression/src/function.rs b/src/query/expression/src/function.rs index 026ce08944f1..df489570eb34 100755 --- a/src/query/expression/src/function.rs +++ b/src/query/expression/src/function.rs @@ -13,9 +13,11 @@ // limitations under the License. use std::collections::HashMap; +use std::ops::BitAnd; use std::sync::Arc; use chrono_tz::Tz; +use common_arrow::arrow::bitmap::MutableBitmap; use serde::Deserialize; use serde::Serialize; @@ -24,8 +26,11 @@ use crate::property::FunctionProperty; use crate::types::nullable::NullableColumn; use crate::types::nullable::NullableDomain; use crate::types::*; +use crate::util::constant_bitmap; use crate::values::Value; use crate::values::ValueRef; +use crate::Column; +use crate::Scalar; #[derive(Debug, Clone)] pub struct FunctionSignature { @@ -1042,3 +1047,52 @@ pub fn passthrough_nullable_3_arg( + f: F, +) -> impl Fn(&[ValueRef], &GenericMap) -> Result, String> + Copy +where F: Fn(&[ValueRef], &GenericMap) -> Result, String> + Copy { + move |args, generics| { + type T = NullableType; + type Result = AnyType; + let mut bitmap: Option = None; + let mut nonull_args: Vec> = Vec::with_capacity(args.len()); + + let mut len = 1; + for arg in args { + let arg = arg.try_downcast::().unwrap(); + match arg { + ValueRef::Scalar(None) => return Ok(Value::Scalar(Scalar::Null)), + ValueRef::Scalar(Some(s)) => { + nonull_args.push(ValueRef::Scalar(s.clone())); + } + ValueRef::Column(v) => { + len = v.len(); + nonull_args.push(ValueRef::Column(v.column.clone())); + bitmap = match bitmap { + Some(m) => Some(m.bitand(&v.validity)), + None => Some(v.validity.clone().make_mut()), + }; + } + } + } + let nonull_results = f(&nonull_args, generics)?; + let bitmap = bitmap.unwrap_or_else(|| constant_bitmap(true, len)); + match nonull_results { + Value::Scalar(s) => { + if bitmap.get(0) { + Ok(Value::Scalar(Result::upcast_scalar(s))) + } else { + Ok(Value::Scalar(Scalar::Null)) + } + } + Value::Column(column) => { + let result = Column::Nullable(Box::new(NullableColumn { + column, + validity: bitmap.into(), + })); + Ok(Value::Column(Result::upcast_column(result))) + } + } + } +} diff --git a/src/query/expression/src/util.rs b/src/query/expression/src/util.rs index ed04d54817ce..6c1b519c1e90 100644 --- a/src/query/expression/src/util.rs +++ b/src/query/expression/src/util.rs @@ -27,6 +27,7 @@ use common_arrow::arrow::io::ipc::write::FileWriter; use common_arrow::arrow::io::ipc::write::WriteOptions; use common_arrow::arrow::types::NativeType; + pub fn bitmap_into_mut(bitmap: Bitmap) -> MutableBitmap { bitmap .into_mut() diff --git a/src/query/expression/src/values.rs b/src/query/expression/src/values.rs index b3842b9f26c4..128957ed9576 100755 --- a/src/query/expression/src/values.rs +++ b/src/query/expression/src/values.rs @@ -212,7 +212,7 @@ impl Value { } impl<'a> ValueRef<'a, AnyType> { - pub fn try_downcast(&self) -> Option> { + pub fn try_downcast(&self) -> Option> { Some(match self { ValueRef::Scalar(scalar) => ValueRef::Scalar(T::try_downcast_scalar(scalar)?), ValueRef::Column(col) => ValueRef::Column(T::try_downcast_column(col)?), diff --git a/src/query/functions-v2/src/scalars/string.rs b/src/query/functions-v2/src/scalars/string.rs index a6bc482a5243..8c0fda15dbef 100644 --- a/src/query/functions-v2/src/scalars/string.rs +++ b/src/query/functions-v2/src/scalars/string.rs @@ -17,6 +17,7 @@ use std::io::Write; use bstr::ByteSlice; use common_expression::types::number::NumberDomain; +use common_expression::types::number::UInt64Type; use common_expression::types::string::StringColumn; use common_expression::types::string::StringColumnBuilder; use common_expression::types::GenericMap; @@ -534,6 +535,74 @@ pub fn register(registry: &mut FunctionRegistry) { }, ), ); + + registry.register_1_arg::( + "ord", + FunctionProperty::default(), + |_| None, + |str: &[u8]| { + let mut res: u64 = 0; + if !str.is_empty() { + if str[0].is_ascii() { + res = str[0] as u64; + } else { + for (p, _) in str.iter().enumerate() { + let s = &str[0..p + 1]; + if std::str::from_utf8(s).is_ok() { + for (i, b) in s.iter().rev().enumerate() { + res += (*b as u64) * 256_u64.pow(i as u32); + } + break; + } + } + } + } + res + }, + ); + + registry.register_passthrough_nullable_1_arg::( + "soundex", + FunctionProperty::default(), + |_| None, + vectorize_string_to_string( + |col| usize::max(col.data.len(), 4 * col.len()), + |val, writer| { + let mut last = None; + let mut count = 0; + + for ch in String::from_utf8_lossy(val).chars() { + let score = Soundex::number_map(ch); + if last.is_none() { + if !Soundex::is_uni_alphabetic(ch) { + continue; + } + last = score; + writer.put_char(ch.to_ascii_uppercase()); + } else { + if !ch.is_ascii_alphabetic() + || Soundex::is_drop(ch) + || score.is_none() + || score == last + { + continue; + } + last = score; + writer.put_char(score.unwrap() as char); + } + + count += 1; + } + // add '0' + for _ in count..4 { + writer.put_char('0'); + } + + writer.commit_row(); + Ok(()) + }, + ), + ); } // Vectorize string to string function with customer estimate_bytes. @@ -604,3 +673,34 @@ fn vectorize_string_to_string_2_arg( } } } + +struct Soundex; + +impl Soundex { + #[inline(always)] + fn number_map(i: char) -> Option { + match i.to_ascii_lowercase() { + 'b' | 'f' | 'p' | 'v' => Some(b'1'), + 'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' => Some(b'2'), + 'd' | 't' => Some(b'3'), + 'l' => Some(b'4'), + 'm' | 'n' => Some(b'5'), + 'r' => Some(b'6'), + _ => Some(b'0'), + } + } + + #[inline(always)] + fn is_drop(c: char) -> bool { + matches!( + c.to_ascii_lowercase(), + 'a' | 'e' | 'i' | 'o' | 'u' | 'y' | 'h' | 'w' + ) + } + + // https://github.com/mysql/mysql-server/blob/3290a66c89eb1625a7058e0ef732432b6952b435/sql/item_strfunc.cc#L1919 + #[inline(always)] + fn is_uni_alphabetic(c: char) -> bool { + ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) || c as i32 >= 0xC0 + } +} diff --git a/src/query/functions-v2/src/scalars/string_multi_args.rs b/src/query/functions-v2/src/scalars/string_multi_args.rs index 4345e0c192f4..e64de9bf2950 100644 --- a/src/query/functions-v2/src/scalars/string_multi_args.rs +++ b/src/query/functions-v2/src/scalars/string_multi_args.rs @@ -12,19 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::ops::BitAnd; use std::sync::Arc; -use common_arrow::arrow::bitmap::MutableBitmap; -use common_expression::types::nullable::NullableColumn; +use common_expression::types::number::UInt64Type; +use common_expression::types::string::StringColumn; use common_expression::types::string::StringColumnBuilder; use common_expression::types::string::StringDomain; +use common_expression::types::AnyType; use common_expression::types::ArgType; use common_expression::types::DataType; +use common_expression::types::GenericMap; use common_expression::types::NullableType; use common_expression::types::StringType; use common_expression::types::ValueType; -use common_expression::util::constant_bitmap; +use common_expression::wrap_nullable; use common_expression::Column; use common_expression::Domain; use common_expression::Function; @@ -54,30 +55,7 @@ pub fn register(registry: &mut FunctionRegistry) { max: None, })) }), - eval: Box::new(|args, _generics| { - let len = args.iter().find_map(|arg| match arg { - ValueRef::Column(col) => Some(col.len()), - _ => None, - }); - let args = args - .iter() - .map(|arg| arg.try_downcast::().unwrap()) - .collect::>(); - - let size = len.unwrap_or(1); - let mut builder = StringColumnBuilder::with_capacity(size, 0); - for idx in 0..size { - for arg in &args { - unsafe { builder.put_slice(arg.index_unchecked(idx)) } - } - builder.commit_row(); - } - - match len { - Some(_) => Ok(Value::Column(Column::String(builder.build()))), - _ => Ok(Value::Scalar(Scalar::String(builder.build_scalar()))), - } - }), + eval: Box::new(concat_fn), })) }); @@ -94,54 +72,7 @@ pub fn register(registry: &mut FunctionRegistry) { property: FunctionProperty::default(), }, calc_domain: Box::new(|_, _| None), - eval: Box::new(|args, _generics| { - type T = NullableType; - let len = args.iter().find_map(|arg| match arg { - ValueRef::Column(col) => Some(col.len()), - _ => None, - }); - - let size = len.unwrap_or(1); - let mut bitmap: Option = None; - let mut inner_args: Vec> = Vec::with_capacity(args.len()); - for arg in args { - let col = arg.try_downcast::().unwrap(); - match col { - ValueRef::Scalar(None) => return Ok(Value::Scalar(T::upcast_scalar(None))), - ValueRef::Column(c) => { - bitmap = match bitmap { - Some(m) => Some(m.bitand(&c.validity)), - None => Some(c.validity.clone().make_mut()), - }; - inner_args.push(ValueRef::Column(c.column.clone())); - } - ValueRef::Scalar(Some(s)) => inner_args.push(ValueRef::Scalar(s)), - } - } - let mut builder = StringColumnBuilder::with_capacity(size, 0); - for idx in 0..size { - for arg in &inner_args { - unsafe { builder.put_slice(arg.index_unchecked(idx)) } - } - builder.commit_row(); - } - - match len { - Some(len) => { - let n = NullableColumn:: { - column: builder.build(), - validity: bitmap - .map(|m| m.into()) - .unwrap_or_else(|| constant_bitmap(true, len).into()), - }; - let c = T::upcast_column(n); - Ok(Value::Column(c)) - } - _ => Ok(Value::Scalar(T::upcast_scalar(Some( - builder.build_scalar(), - )))), - } - }), + eval: Box::new(wrap_nullable(concat_fn)), })) }); @@ -309,4 +240,106 @@ pub fn register(registry: &mut FunctionRegistry) { }), })) }); + + registry.register_function_factory("char", |_, args_type| { + if args_type.is_empty() { + return None; + } + Some(Arc::new(Function { + signature: FunctionSignature { + name: "char", + args_type: vec![DataType::UInt64; args_type.len()], + return_type: DataType::String, + property: FunctionProperty::default(), + }, + calc_domain: Box::new(|_, _| None), + eval: Box::new(char_fn), + })) + }); + + // nullable char + registry.register_function_factory("char", |_, args_type| { + if args_type.is_empty() { + return None; + } + Some(Arc::new(Function { + signature: FunctionSignature { + name: "char", + args_type: vec![DataType::Nullable(Box::new(DataType::UInt64)); args_type.len()], + return_type: DataType::Nullable(Box::new(DataType::String)), + property: FunctionProperty::default(), + }, + calc_domain: Box::new(|_, _| None), + eval: Box::new(wrap_nullable(char_fn)), + })) + }); +} + +fn concat_fn(args: &[ValueRef], _: &GenericMap) -> Result, String> { + let len = args.iter().find_map(|arg| match arg { + ValueRef::Column(col) => Some(col.len()), + _ => None, + }); + let args = args + .iter() + .map(|arg| arg.try_downcast::().unwrap()) + .collect::>(); + + let size = len.unwrap_or(1); + let mut builder = StringColumnBuilder::with_capacity(size, 0); + for idx in 0..size { + for arg in &args { + unsafe { builder.put_slice(arg.index_unchecked(idx)) } + } + builder.commit_row(); + } + + match len { + Some(_) => Ok(Value::Column(Column::String(builder.build()))), + _ => Ok(Value::Scalar(Scalar::String(builder.build_scalar()))), + } +} + +fn char_fn(args: &[ValueRef], _: &GenericMap) -> Result, String> { + let args = args + .iter() + .map(|arg| arg.try_downcast::().unwrap()) + .collect::>(); + + let len = args.iter().find_map(|arg| match arg { + ValueRef::Column(col) => Some(col.len()), + _ => None, + }); + let input_rows = len.unwrap_or(1); + + let mut values: Vec = vec![0; input_rows * args.len()]; + let values_ptr = values.as_mut_ptr(); + + for (i, arg) in args.iter().enumerate() { + match arg { + ValueRef::Scalar(v) => { + let v = *v as u8; + for j in 0..input_rows { + unsafe { + *values_ptr.add(args.len() * j + i) = v; + } + } + } + ValueRef::Column(c) => { + for (j, ch) in UInt64Type::iter_column(c).enumerate() { + unsafe { + *values_ptr.add(args.len() * j + i) = ch as u8; + } + } + } + } + } + let offsets = (0..(input_rows + 1) as u64 * args.len() as u64) + .step_by(args.len()) + .collect::>(); + let result = StringColumn { + data: values.into(), + offsets: offsets.into(), + }; + Ok(Value::Column(Column::String(result))) } diff --git a/src/query/functions-v2/tests/it/scalars/string.rs b/src/query/functions-v2/tests/it/scalars/string.rs index 0b875702c8b7..f5932f25de3d 100644 --- a/src/query/functions-v2/tests/it/scalars/string.rs +++ b/src/query/functions-v2/tests/it/scalars/string.rs @@ -51,6 +51,9 @@ fn test_string() { test_replace(file); test_strcmp(file); test_locate(file); + test_char(file); + test_soundex(file); + test_ord(file); } fn test_upper(file: &mut impl Write) { @@ -593,3 +596,42 @@ fn test_locate(file: &mut impl Write) { ]; run_ast(file, "locate(a, b, c)", &table); } + +fn test_char(file: &mut impl Write) { + run_ast(file, "char(65,66,67)", &[]); + run_ast(file, "char(11111, null)", &[]); + + let table = [ + ("a", DataType::UInt8, Column::from_data(vec![66, 67])), + ("b", DataType::UInt16, Column::from_data(vec![98, 99])), + ("c", DataType::UInt16, Column::from_data(vec![68, 69])), + ( + "a2", + DataType::Nullable(Box::new(DataType::UInt8)), + Column::from_data_with_validity(vec![66, 67], vec![true, false]), + ), + ]; + run_ast(file, "char(a, b, c)", &table); + run_ast(file, "char(a2, b, c)", &table); +} + +fn test_soundex(file: &mut impl Write) { + run_ast(file, "soundex('你好中国北京')", &[]); + run_ast(file, "soundex('')", &[]); + run_ast(file, "soundex('hello all folks')", &[]); + run_ast(file, "soundex('#3556 in bugdb')", &[]); + + let table = [( + "a", + DataType::String, + Column::from_data(&["#🐑🐑he🐑llo🐑", "🐑he🐑llo🐑", "teacher", "TEACHER"]), + )]; + run_ast(file, "soundex(a)", &table); +} + +fn test_ord(file: &mut impl Write) { + run_ast(file, "ord(NULL)", &[]); + run_ast(file, "ord('и')", &[]); + run_ast(file, "ord('早ab')", &[]); + run_ast(file, "ord('💖')", &[]); +} diff --git a/src/query/functions-v2/tests/it/scalars/testdata/string.txt b/src/query/functions-v2/tests/it/scalars/testdata/string.txt index 9efc18da3ac6..52b34eefa2ec 100644 --- a/src/query/functions-v2/tests/it/scalars/testdata/string.txt +++ b/src/query/functions-v2/tests/it/scalars/testdata/string.txt @@ -2538,3 +2538,139 @@ evaluation (internal): +--------+------------------------------------------------------------------------------------------------------------------------------------------------------+ +ast : char(65,66,67) +raw expr : char(65_u8, 66_u8, 67_u8) +checked expr : char(CAST(65_u8 AS UInt64), CAST(66_u8 AS UInt64), CAST(67_u8 AS UInt64)) +optimized expr : char(65_u64, 66_u64, 67_u64) +evaluation: ++--------+---------+ +| | Output | ++--------+---------+ +| Type | String | +| Domain | Unknown | +| Row 0 | "ABC" | ++--------+---------+ +evaluation (internal): ++--------+--------------------------------------------------------------+ +| Column | Data | ++--------+--------------------------------------------------------------+ +| Output | String(StringColumn { data: [65, 66, 67], offsets: [0, 3] }) | ++--------+--------------------------------------------------------------+ + + +ast : char(11111, null) +raw expr : char(11111_u16, NULL) +checked expr : char(CAST(11111_u16 AS UInt64 NULL), CAST(NULL AS UInt64 NULL)) +optimized expr : NULL +output type : String NULL +output domain : Unknown +output : NULL + + +ast : char(a, b, c) +raw expr : char(ColumnRef(0)::UInt8, ColumnRef(1)::UInt16, ColumnRef(2)::UInt16) +checked expr : char(CAST(ColumnRef(0) AS UInt64), CAST(ColumnRef(1) AS UInt64), CAST(ColumnRef(2) AS UInt64)) +evaluation: ++--------+-----------+-----------+-----------+--------------------+---------+ +| | a | b | c | a2 | Output | ++--------+-----------+-----------+-----------+--------------------+---------+ +| Type | UInt8 | UInt16 | UInt16 | UInt8 NULL | String | +| Domain | {66..=67} | {98..=99} | {68..=69} | {66..=67} ∪ {NULL} | Unknown | +| Row 0 | 66 | 98 | 68 | 66 | "BbD" | +| Row 1 | 67 | 99 | 69 | NULL | "CcE" | ++--------+-----------+-----------+-----------+--------------------+---------+ +evaluation (internal): ++--------+------------------------------------------------------------------------------+ +| Column | Data | ++--------+------------------------------------------------------------------------------+ +| a | Int32([66, 67]) | +| b | Int32([98, 99]) | +| c | Int32([68, 69]) | +| a2 | Nullable(NullableColumn { column: Int32([66, 67]), validity: [0b______01] }) | +| Output | String(StringColumn { data: [66, 98, 68, 67, 99, 69], offsets: [0, 3, 6] }) | ++--------+------------------------------------------------------------------------------+ + + +ast : char(a2, b, c) +raw expr : char(ColumnRef(3)::UInt8 NULL, ColumnRef(1)::UInt16, ColumnRef(2)::UInt16) +checked expr : char(CAST(ColumnRef(3) AS UInt64 NULL), CAST(ColumnRef(1) AS UInt64 NULL), CAST(ColumnRef(2) AS UInt64 NULL)) +evaluation: ++--------+-----------+-----------+-----------+--------------------+-------------+ +| | a | b | c | a2 | Output | ++--------+-----------+-----------+-----------+--------------------+-------------+ +| Type | UInt8 | UInt16 | UInt16 | UInt8 NULL | String NULL | +| Domain | {66..=67} | {98..=99} | {68..=69} | {66..=67} ∪ {NULL} | Unknown | +| Row 0 | 66 | 98 | 68 | 66 | "BbD" | +| Row 1 | 67 | 99 | 69 | NULL | NULL | ++--------+-----------+-----------+-----------+--------------------+-------------+ +evaluation (internal): ++--------+------------------------------------------------------------------------------------------------------------------------------------------+ +| Column | Data | ++--------+------------------------------------------------------------------------------------------------------------------------------------------+ +| a | Int32([66, 67]) | +| b | Int32([98, 99]) | +| c | Int32([68, 69]) | +| a2 | Nullable(NullableColumn { column: Int32([66, 67]), validity: [0b______01] }) | +| Output | Nullable(NullableColumn { column: String(StringColumn { data: [66, 98, 68, 67, 99, 69], offsets: [0, 3, 6] }), validity: [0b______01] }) | ++--------+------------------------------------------------------------------------------------------------------------------------------------------+ + + +ast : soundex('你好中国北京') +raw expr : soundex("你好中国北京") +checked expr : soundex("你好中国北京") +optimized expr : "你000" +output type : String +output domain : Unknown +output : "你000" + + +ast : soundex('') +raw expr : soundex("") +checked expr : soundex("") +optimized expr : "0000" +output type : String +output domain : Unknown +output : "0000" + + +ast : soundex('hello all folks') +raw expr : soundex("hello all folks") +checked expr : soundex("hello all folks") +optimized expr : "H4142" +output type : String +output domain : Unknown +output : "H4142" + + +ast : soundex('#3556 in bugdb') +raw expr : soundex("#3556 in bugdb") +checked expr : soundex("#3556 in bugdb") +optimized expr : "I51231" +output type : String +output domain : Unknown +output : "I51231" + + +ast : soundex(a) +raw expr : soundex(ColumnRef(0)::String) +checked expr : soundex(ColumnRef(0)) +evaluation: ++--------+------------------------------------+---------+ +| | a | Output | ++--------+------------------------------------+---------+ +| Type | String | String | +| Domain | {"#🐑🐑he🐑llo🐑"..="🐑he🐑llo🐑"} | Unknown | +| Row 0 | "#🐑🐑he🐑llo🐑" | "🐑400" | +| Row 1 | "🐑he🐑llo🐑" | "🐑400" | +| Row 2 | "teacher" | "T260" | +| Row 3 | "TEACHER" | "T260" | ++--------+------------------------------------+---------+ +evaluation (internal): ++--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Column | Data | ++--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| a | String(StringColumn { data: [35, 240, 159, 144, 145, 240, 159, 144, 145, 104, 101, 240, 159, 144, 145, 108, 108, 111, 240, 159, 144, 145, 240, 159, 144, 145, 104, 101, 240, 159, 144, 145, 108, 108, 111, 240, 159, 144, 145, 116, 101, 97, 99, 104, 101, 114, 84, 69, 65, 67, 72, 69, 82], offsets: [0, 22, 39, 46, 53] }) | +| Output | String(StringColumn { data: [240, 159, 144, 145, 52, 48, 48, 240, 159, 144, 145, 52, 48, 48, 84, 50, 54, 48, 84, 50, 54, 48], offsets: [0, 7, 14, 18, 22] }) | ++--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + + From d7fb5b152893e8dd0b55e1e75552583b9d8aacdb Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Fri, 9 Sep 2022 09:38:26 +0800 Subject: [PATCH 2/6] chore(query): add char/ord/soundex to function-v2 --- src/query/expression/src/util.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/query/expression/src/util.rs b/src/query/expression/src/util.rs index 6c1b519c1e90..ed04d54817ce 100644 --- a/src/query/expression/src/util.rs +++ b/src/query/expression/src/util.rs @@ -27,7 +27,6 @@ use common_arrow::arrow::io::ipc::write::FileWriter; use common_arrow::arrow::io::ipc::write::WriteOptions; use common_arrow::arrow::types::NativeType; - pub fn bitmap_into_mut(bitmap: Bitmap) -> MutableBitmap { bitmap .into_mut() From 2b64ccb7660f154f07fee644a9bdab98307be279 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Fri, 9 Sep 2022 10:16:38 +0800 Subject: [PATCH 3/6] chore(query): add char/ord/soundex to function-v2 --- .../tests/it/scalars/testdata/string.txt | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/query/functions-v2/tests/it/scalars/testdata/string.txt b/src/query/functions-v2/tests/it/scalars/testdata/string.txt index 52b34eefa2ec..9ffa3c784364 100644 --- a/src/query/functions-v2/tests/it/scalars/testdata/string.txt +++ b/src/query/functions-v2/tests/it/scalars/testdata/string.txt @@ -2674,3 +2674,39 @@ evaluation (internal): +--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +ast : ord(NULL) +raw expr : ord(NULL) +checked expr : ord(NULL) +optimized expr : NULL +output type : NULL +output domain : {NULL} +output : NULL + + +ast : ord('и') +raw expr : ord("и") +checked expr : ord("и") +optimized expr : 53432_u64 +output type : UInt64 +output domain : Unknown +output : 53432 + + +ast : ord('早ab') +raw expr : ord("早ab") +checked expr : ord("早ab") +optimized expr : 15112105_u64 +output type : UInt64 +output domain : Unknown +output : 15112105 + + +ast : ord('💖') +raw expr : ord("💖") +checked expr : ord("💖") +optimized expr : 4036989590_u64 +output type : UInt64 +output domain : Unknown +output : 4036989590 + + From 9c6385697e28bd8c22e619c6e2be946590c851fb Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Fri, 9 Sep 2022 15:15:14 +0800 Subject: [PATCH 4/6] chore(query): make char function only accept u8 type --- .../src/scalars/string_multi_args.rs | 15 ++-- .../functions-v2/tests/it/scalars/string.rs | 12 +-- .../tests/it/scalars/testdata/string.txt | 79 +++++++++++-------- .../src/scalars/conditionals/in_basic.rs | 6 +- .../02_0045_conditional_function_in | 6 +- 5 files changed, 66 insertions(+), 52 deletions(-) diff --git a/src/query/functions-v2/src/scalars/string_multi_args.rs b/src/query/functions-v2/src/scalars/string_multi_args.rs index e64de9bf2950..ce022a4349ab 100644 --- a/src/query/functions-v2/src/scalars/string_multi_args.rs +++ b/src/query/functions-v2/src/scalars/string_multi_args.rs @@ -14,7 +14,7 @@ use std::sync::Arc; -use common_expression::types::number::UInt64Type; +use common_expression::types::number::UInt8Type; use common_expression::types::string::StringColumn; use common_expression::types::string::StringColumnBuilder; use common_expression::types::string::StringDomain; @@ -248,7 +248,7 @@ pub fn register(registry: &mut FunctionRegistry) { Some(Arc::new(Function { signature: FunctionSignature { name: "char", - args_type: vec![DataType::UInt64; args_type.len()], + args_type: vec![DataType::UInt8; args_type.len()], return_type: DataType::String, property: FunctionProperty::default(), }, @@ -265,7 +265,7 @@ pub fn register(registry: &mut FunctionRegistry) { Some(Arc::new(Function { signature: FunctionSignature { name: "char", - args_type: vec![DataType::Nullable(Box::new(DataType::UInt64)); args_type.len()], + args_type: vec![DataType::Nullable(Box::new(DataType::UInt8)); args_type.len()], return_type: DataType::Nullable(Box::new(DataType::String)), property: FunctionProperty::default(), }, @@ -303,7 +303,7 @@ fn concat_fn(args: &[ValueRef], _: &GenericMap) -> Result], _: &GenericMap) -> Result, String> { let args = args .iter() - .map(|arg| arg.try_downcast::().unwrap()) + .map(|arg| arg.try_downcast::().unwrap()) .collect::>(); let len = args.iter().find_map(|arg| match arg { @@ -318,17 +318,16 @@ fn char_fn(args: &[ValueRef], _: &GenericMap) -> Result, for (i, arg) in args.iter().enumerate() { match arg { ValueRef::Scalar(v) => { - let v = *v as u8; for j in 0..input_rows { unsafe { - *values_ptr.add(args.len() * j + i) = v; + *values_ptr.add(args.len() * j + i) = *v; } } } ValueRef::Column(c) => { - for (j, ch) in UInt64Type::iter_column(c).enumerate() { + for (j, ch) in UInt8Type::iter_column(c).enumerate() { unsafe { - *values_ptr.add(args.len() * j + i) = ch as u8; + *values_ptr.add(args.len() * j + i) = ch; } } } diff --git a/src/query/functions-v2/tests/it/scalars/string.rs b/src/query/functions-v2/tests/it/scalars/string.rs index f5932f25de3d..f0305d25b90b 100644 --- a/src/query/functions-v2/tests/it/scalars/string.rs +++ b/src/query/functions-v2/tests/it/scalars/string.rs @@ -599,20 +599,22 @@ fn test_locate(file: &mut impl Write) { fn test_char(file: &mut impl Write) { run_ast(file, "char(65,66,67)", &[]); - run_ast(file, "char(11111, null)", &[]); + run_ast(file, "char(65, null)", &[]); let table = [ - ("a", DataType::UInt8, Column::from_data(vec![66, 67])), - ("b", DataType::UInt16, Column::from_data(vec![98, 99])), - ("c", DataType::UInt16, Column::from_data(vec![68, 69])), + ("a", DataType::UInt8, Column::from_data(vec![66u8, 67])), + ("b", DataType::UInt8, Column::from_data(vec![98u8, 99])), + ("c", DataType::UInt8, Column::from_data(vec![68u8, 69])), + ("c2", DataType::UInt16, Column::from_data(vec![68u16, 69])), ( "a2", DataType::Nullable(Box::new(DataType::UInt8)), - Column::from_data_with_validity(vec![66, 67], vec![true, false]), + Column::from_data_with_validity(vec![66u8, 67], vec![true, false]), ), ]; run_ast(file, "char(a, b, c)", &table); run_ast(file, "char(a2, b, c)", &table); + run_ast(file, "char(c2)", &table); } fn test_soundex(file: &mut impl Write) { diff --git a/src/query/functions-v2/tests/it/scalars/testdata/string.txt b/src/query/functions-v2/tests/it/scalars/testdata/string.txt index 9ffa3c784364..ed6e66559421 100644 --- a/src/query/functions-v2/tests/it/scalars/testdata/string.txt +++ b/src/query/functions-v2/tests/it/scalars/testdata/string.txt @@ -2540,8 +2540,7 @@ evaluation (internal): ast : char(65,66,67) raw expr : char(65_u8, 66_u8, 67_u8) -checked expr : char(CAST(65_u8 AS UInt64), CAST(66_u8 AS UInt64), CAST(67_u8 AS UInt64)) -optimized expr : char(65_u64, 66_u64, 67_u64) +checked expr : char(65_u8, 66_u8, 67_u8) evaluation: +--------+---------+ | | Output | @@ -2558,9 +2557,9 @@ evaluation (internal): +--------+--------------------------------------------------------------+ -ast : char(11111, null) -raw expr : char(11111_u16, NULL) -checked expr : char(CAST(11111_u16 AS UInt64 NULL), CAST(NULL AS UInt64 NULL)) +ast : char(65, null) +raw expr : char(65_u8, NULL) +checked expr : char(CAST(65_u8 AS UInt8 NULL), CAST(NULL AS UInt8 NULL)) optimized expr : NULL output type : String NULL output domain : Unknown @@ -2568,53 +2567,67 @@ output : NULL ast : char(a, b, c) -raw expr : char(ColumnRef(0)::UInt8, ColumnRef(1)::UInt16, ColumnRef(2)::UInt16) -checked expr : char(CAST(ColumnRef(0) AS UInt64), CAST(ColumnRef(1) AS UInt64), CAST(ColumnRef(2) AS UInt64)) +raw expr : char(ColumnRef(0)::UInt8, ColumnRef(1)::UInt8, ColumnRef(2)::UInt8) +checked expr : char(ColumnRef(0), ColumnRef(1), ColumnRef(2)) evaluation: -+--------+-----------+-----------+-----------+--------------------+---------+ -| | a | b | c | a2 | Output | -+--------+-----------+-----------+-----------+--------------------+---------+ -| Type | UInt8 | UInt16 | UInt16 | UInt8 NULL | String | -| Domain | {66..=67} | {98..=99} | {68..=69} | {66..=67} ∪ {NULL} | Unknown | -| Row 0 | 66 | 98 | 68 | 66 | "BbD" | -| Row 1 | 67 | 99 | 69 | NULL | "CcE" | -+--------+-----------+-----------+-----------+--------------------+---------+ ++--------+-----------+-----------+-----------+-----------+--------------------+---------+ +| | a | b | c | c2 | a2 | Output | ++--------+-----------+-----------+-----------+-----------+--------------------+---------+ +| Type | UInt8 | UInt8 | UInt8 | UInt16 | UInt8 NULL | String | +| Domain | {66..=67} | {98..=99} | {68..=69} | {68..=69} | {66..=67} ∪ {NULL} | Unknown | +| Row 0 | 66 | 98 | 68 | 68 | 66 | "BbD" | +| Row 1 | 67 | 99 | 69 | 69 | NULL | "CcE" | ++--------+-----------+-----------+-----------+-----------+--------------------+---------+ evaluation (internal): +--------+------------------------------------------------------------------------------+ | Column | Data | +--------+------------------------------------------------------------------------------+ -| a | Int32([66, 67]) | -| b | Int32([98, 99]) | -| c | Int32([68, 69]) | -| a2 | Nullable(NullableColumn { column: Int32([66, 67]), validity: [0b______01] }) | +| a | UInt8([66, 67]) | +| b | UInt8([98, 99]) | +| c | UInt8([68, 69]) | +| c2 | UInt16([68, 69]) | +| a2 | Nullable(NullableColumn { column: UInt8([66, 67]), validity: [0b______01] }) | | Output | String(StringColumn { data: [66, 98, 68, 67, 99, 69], offsets: [0, 3, 6] }) | +--------+------------------------------------------------------------------------------+ ast : char(a2, b, c) -raw expr : char(ColumnRef(3)::UInt8 NULL, ColumnRef(1)::UInt16, ColumnRef(2)::UInt16) -checked expr : char(CAST(ColumnRef(3) AS UInt64 NULL), CAST(ColumnRef(1) AS UInt64 NULL), CAST(ColumnRef(2) AS UInt64 NULL)) +raw expr : char(ColumnRef(4)::UInt8 NULL, ColumnRef(1)::UInt8, ColumnRef(2)::UInt8) +checked expr : char(ColumnRef(4), CAST(ColumnRef(1) AS UInt8 NULL), CAST(ColumnRef(2) AS UInt8 NULL)) evaluation: -+--------+-----------+-----------+-----------+--------------------+-------------+ -| | a | b | c | a2 | Output | -+--------+-----------+-----------+-----------+--------------------+-------------+ -| Type | UInt8 | UInt16 | UInt16 | UInt8 NULL | String NULL | -| Domain | {66..=67} | {98..=99} | {68..=69} | {66..=67} ∪ {NULL} | Unknown | -| Row 0 | 66 | 98 | 68 | 66 | "BbD" | -| Row 1 | 67 | 99 | 69 | NULL | NULL | -+--------+-----------+-----------+-----------+--------------------+-------------+ ++--------+-----------+-----------+-----------+-----------+--------------------+-------------+ +| | a | b | c | c2 | a2 | Output | ++--------+-----------+-----------+-----------+-----------+--------------------+-------------+ +| Type | UInt8 | UInt8 | UInt8 | UInt16 | UInt8 NULL | String NULL | +| Domain | {66..=67} | {98..=99} | {68..=69} | {68..=69} | {66..=67} ∪ {NULL} | Unknown | +| Row 0 | 66 | 98 | 68 | 68 | 66 | "BbD" | +| Row 1 | 67 | 99 | 69 | 69 | NULL | NULL | ++--------+-----------+-----------+-----------+-----------+--------------------+-------------+ evaluation (internal): +--------+------------------------------------------------------------------------------------------------------------------------------------------+ | Column | Data | +--------+------------------------------------------------------------------------------------------------------------------------------------------+ -| a | Int32([66, 67]) | -| b | Int32([98, 99]) | -| c | Int32([68, 69]) | -| a2 | Nullable(NullableColumn { column: Int32([66, 67]), validity: [0b______01] }) | +| a | UInt8([66, 67]) | +| b | UInt8([98, 99]) | +| c | UInt8([68, 69]) | +| c2 | UInt16([68, 69]) | +| a2 | Nullable(NullableColumn { column: UInt8([66, 67]), validity: [0b______01] }) | | Output | Nullable(NullableColumn { column: String(StringColumn { data: [66, 98, 68, 67, 99, 69], offsets: [0, 3, 6] }), validity: [0b______01] }) | +--------+------------------------------------------------------------------------------------------------------------------------------------------+ +error: + --> SQL:1:1 + | +1 | char(c2) + | ^^^^^^^^ no overload satisfies `char(UInt16)` + +has tried possible overloads: + char(UInt8) :: String : unable to unify `UInt16` with `UInt8` + char(UInt8 NULL) :: String NULL : unable to unify `UInt16` with `UInt8` + + + ast : soundex('你好中国北京') raw expr : soundex("你好中国北京") checked expr : soundex("你好中国北京") diff --git a/src/query/functions/src/scalars/conditionals/in_basic.rs b/src/query/functions/src/scalars/conditionals/in_basic.rs index 4e8e9b5cf7d8..710b37b75e1a 100644 --- a/src/query/functions/src/scalars/conditionals/in_basic.rs +++ b/src/query/functions/src/scalars/conditionals/in_basic.rs @@ -124,9 +124,9 @@ impl Function for InFunction { return Ok(col); } - let null_flag = columns[1..] - .iter() - .any(|column| column.field().data_type().is_null()); + let null_flag = columns[1..].iter().any(|column| { + column.field().data_type().is_null() || column.field().data_type().is_nullable() + }); let mut least_super_dt = columns[0].field().data_type().clone(); let mut nonull_least_super_dt = remove_nullable(&least_super_dt); diff --git a/tests/logictest/suites/base/02_function/02_0045_conditional_function_in b/tests/logictest/suites/base/02_function/02_0045_conditional_function_in index 79d27a70270b..c2f6e410d033 100644 --- a/tests/logictest/suites/base/02_function/02_0045_conditional_function_in +++ b/tests/logictest/suites/base/02_function/02_0045_conditional_function_in @@ -33,11 +33,11 @@ SELECT NULL IN (1, 2, 3); ---- NULL -statement query B -SELECT 1 IN (1, 2, NULL); +statement query BB +SELECT 1 IN (1, 2, NULL), (true IN (false, (NULL NOT BETWEEN NULL AND NULL))); ---- -1 +1 0 statement query BB From 5c22f91a06cdced00788806c11f954e32a7b2447 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Fri, 9 Sep 2022 15:16:16 +0800 Subject: [PATCH 5/6] chore(query): make char function only accept u8 type --- src/query/expression/src/function.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/query/expression/src/function.rs b/src/query/expression/src/function.rs index df489570eb34..45eae2a7a078 100755 --- a/src/query/expression/src/function.rs +++ b/src/query/expression/src/function.rs @@ -1055,8 +1055,9 @@ where F: Fn(&[ValueRef], &GenericMap) -> Result, String> move |args, generics| { type T = NullableType; type Result = AnyType; + let mut bitmap: Option = None; - let mut nonull_args: Vec> = Vec::with_capacity(args.len()); + let mut nonull_args: Vec> = Vec::with_capacity(args.len()); let mut len = 1; for arg in args { From ce55f930b5949d12399a016b44d5a09a016a4b02 Mon Sep 17 00:00:00 2001 From: sundy-li <543950155@qq.com> Date: Fri, 9 Sep 2022 18:32:34 +0800 Subject: [PATCH 6/6] chore(query): make soundex a mod --- src/query/functions-v2/src/scalars/mod.rs | 1 + src/query/functions-v2/src/scalars/soundex.rs | 44 +++++++++++++++++++ src/query/functions-v2/src/scalars/string.rs | 33 +------------- 3 files changed, 47 insertions(+), 31 deletions(-) create mode 100644 src/query/functions-v2/src/scalars/soundex.rs diff --git a/src/query/functions-v2/src/scalars/mod.rs b/src/query/functions-v2/src/scalars/mod.rs index e17aae024dd4..54a2d7848740 100644 --- a/src/query/functions-v2/src/scalars/mod.rs +++ b/src/query/functions-v2/src/scalars/mod.rs @@ -22,6 +22,7 @@ mod boolean; mod control; mod datetime; mod math; +mod soundex; mod string; mod string_multi_args; diff --git a/src/query/functions-v2/src/scalars/soundex.rs b/src/query/functions-v2/src/scalars/soundex.rs new file mode 100644 index 000000000000..94c8df4f75d6 --- /dev/null +++ b/src/query/functions-v2/src/scalars/soundex.rs @@ -0,0 +1,44 @@ +// Copyright 2021 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub(crate) struct Soundex; + +impl Soundex { + #[inline(always)] + pub fn number_map(i: char) -> Option { + match i.to_ascii_lowercase() { + 'b' | 'f' | 'p' | 'v' => Some(b'1'), + 'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' => Some(b'2'), + 'd' | 't' => Some(b'3'), + 'l' => Some(b'4'), + 'm' | 'n' => Some(b'5'), + 'r' => Some(b'6'), + _ => Some(b'0'), + } + } + + #[inline(always)] + pub fn is_drop(c: char) -> bool { + matches!( + c.to_ascii_lowercase(), + 'a' | 'e' | 'i' | 'o' | 'u' | 'y' | 'h' | 'w' + ) + } + + // https://github.com/mysql/mysql-server/blob/3290a66c89eb1625a7058e0ef732432b6952b435/sql/item_strfunc.cc#L1919 + #[inline(always)] + pub fn is_uni_alphabetic(c: char) -> bool { + ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) || c as i32 >= 0xC0 + } +} diff --git a/src/query/functions-v2/src/scalars/string.rs b/src/query/functions-v2/src/scalars/string.rs index 8c0fda15dbef..034823fa3e35 100644 --- a/src/query/functions-v2/src/scalars/string.rs +++ b/src/query/functions-v2/src/scalars/string.rs @@ -30,6 +30,8 @@ use common_expression::Value; use common_expression::ValueRef; use itertools::izip; +use super::soundex::Soundex; + pub fn register(registry: &mut FunctionRegistry) { registry.register_passthrough_nullable_1_arg::( "upper", @@ -673,34 +675,3 @@ fn vectorize_string_to_string_2_arg( } } } - -struct Soundex; - -impl Soundex { - #[inline(always)] - fn number_map(i: char) -> Option { - match i.to_ascii_lowercase() { - 'b' | 'f' | 'p' | 'v' => Some(b'1'), - 'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' => Some(b'2'), - 'd' | 't' => Some(b'3'), - 'l' => Some(b'4'), - 'm' | 'n' => Some(b'5'), - 'r' => Some(b'6'), - _ => Some(b'0'), - } - } - - #[inline(always)] - fn is_drop(c: char) -> bool { - matches!( - c.to_ascii_lowercase(), - 'a' | 'e' | 'i' | 'o' | 'u' | 'y' | 'h' | 'w' - ) - } - - // https://github.com/mysql/mysql-server/blob/3290a66c89eb1625a7058e0ef732432b6952b435/sql/item_strfunc.cc#L1919 - #[inline(always)] - fn is_uni_alphabetic(c: char) -> bool { - ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) || c as i32 >= 0xC0 - } -}