Skip to content

Commit

Permalink
Merge pull request #7539 from sundy-li/fv2-string
Browse files Browse the repository at this point in the history
feat(query): add char/ord/soundex to function-v2
  • Loading branch information
mergify[bot] committed Sep 9, 2022
2 parents fa49f1c + b2fb291 commit 8202fed
Show file tree
Hide file tree
Showing 10 changed files with 515 additions and 83 deletions.
55 changes: 55 additions & 0 deletions src/query/expression/src/function.rs
Expand Up @@ -13,9 +13,11 @@
// limitations under the License.

use std::collections::HashMap;
use std::ops::BitAnd;
use std::sync::Arc;

use chrono_tz::Tz;
use common_arrow::arrow::bitmap::MutableBitmap;
use serde::Deserialize;
use serde::Serialize;

Expand All @@ -24,8 +26,11 @@ use crate::property::FunctionProperty;
use crate::types::nullable::NullableColumn;
use crate::types::nullable::NullableDomain;
use crate::types::*;
use crate::util::constant_bitmap;
use crate::values::Value;
use crate::values::ValueRef;
use crate::Column;
use crate::Scalar;

#[derive(Debug, Clone)]
pub struct FunctionSignature {
Expand Down Expand Up @@ -1042,3 +1047,53 @@ pub fn passthrough_nullable_3_arg<I1: ArgType, I2: ArgType, I3: ArgType, O: ArgT
}
}
}

pub fn wrap_nullable<F>(
f: F,
) -> impl Fn(&[ValueRef<AnyType>], &GenericMap) -> Result<Value<AnyType>, String> + Copy
where F: Fn(&[ValueRef<AnyType>], &GenericMap) -> Result<Value<AnyType>, String> + Copy {
move |args, generics| {
type T = NullableType<AnyType>;
type Result = AnyType;

let mut bitmap: Option<MutableBitmap> = None;
let mut nonull_args: Vec<ValueRef<Result>> = Vec::with_capacity(args.len());

let mut len = 1;
for arg in args {
let arg = arg.try_downcast::<T>().unwrap();
match arg {
ValueRef::Scalar(None) => return Ok(Value::Scalar(Scalar::Null)),
ValueRef::Scalar(Some(s)) => {
nonull_args.push(ValueRef::Scalar(s.clone()));
}
ValueRef::Column(v) => {
len = v.len();
nonull_args.push(ValueRef::Column(v.column.clone()));
bitmap = match bitmap {
Some(m) => Some(m.bitand(&v.validity)),
None => Some(v.validity.clone().make_mut()),
};
}
}
}
let nonull_results = f(&nonull_args, generics)?;
let bitmap = bitmap.unwrap_or_else(|| constant_bitmap(true, len));
match nonull_results {
Value::Scalar(s) => {
if bitmap.get(0) {
Ok(Value::Scalar(Result::upcast_scalar(s)))
} else {
Ok(Value::Scalar(Scalar::Null))
}
}
Value::Column(column) => {
let result = Column::Nullable(Box::new(NullableColumn {
column,
validity: bitmap.into(),
}));
Ok(Value::Column(Result::upcast_column(result)))
}
}
}
}
2 changes: 1 addition & 1 deletion src/query/expression/src/values.rs
Expand Up @@ -212,7 +212,7 @@ impl<T: ArgType> Value<T> {
}

impl<'a> ValueRef<'a, AnyType> {
pub fn try_downcast<T: ArgType>(&self) -> Option<ValueRef<'_, T>> {
pub fn try_downcast<T: ValueType>(&self) -> Option<ValueRef<'_, T>> {
Some(match self {
ValueRef::Scalar(scalar) => ValueRef::Scalar(T::try_downcast_scalar(scalar)?),
ValueRef::Column(col) => ValueRef::Column(T::try_downcast_column(col)?),
Expand Down
1 change: 1 addition & 0 deletions src/query/functions-v2/src/scalars/mod.rs
Expand Up @@ -22,6 +22,7 @@ mod boolean;
mod control;
mod datetime;
mod math;
mod soundex;
mod string;
mod string_multi_args;

Expand Down
44 changes: 44 additions & 0 deletions src/query/functions-v2/src/scalars/soundex.rs
@@ -0,0 +1,44 @@
// Copyright 2021 Datafuse Labs.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/// Character-classification helpers used by the `soundex` string function.
pub(crate) struct Soundex;

impl Soundex {
    /// Returns the Soundex digit (as an ASCII byte) for `i`, ignoring ASCII case.
    ///
    /// Characters that belong to no consonant group map to `b'0'`, so the
    /// result is always `Some`.
    #[inline(always)]
    pub fn number_map(i: char) -> Option<u8> {
        let digit = match i.to_ascii_lowercase() {
            'b' | 'f' | 'p' | 'v' => b'1',
            'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' => b'2',
            'd' | 't' => b'3',
            'l' => b'4',
            'm' | 'n' => b'5',
            'r' => b'6',
            _ => b'0',
        };
        Some(digit)
    }

    /// Returns `true` when `c` is one of `a e i o u y h w` (ASCII
    /// case-insensitive).
    #[inline(always)]
    pub fn is_drop(c: char) -> bool {
        "aeiouyhw".contains(c.to_ascii_lowercase())
    }

    /// Returns `true` for ASCII letters and for every code point at or above
    /// U+00C0.
    // https://github.com/mysql/mysql-server/blob/3290a66c89eb1625a7058e0ef732432b6952b435/sql/item_strfunc.cc#L1919
    #[inline(always)]
    pub fn is_uni_alphabetic(c: char) -> bool {
        c.is_ascii_alphabetic() || u32::from(c) >= 0xC0
    }
}
71 changes: 71 additions & 0 deletions src/query/functions-v2/src/scalars/string.rs
Expand Up @@ -17,6 +17,7 @@ use std::io::Write;

use bstr::ByteSlice;
use common_expression::types::number::NumberDomain;
use common_expression::types::number::UInt64Type;
use common_expression::types::string::StringColumn;
use common_expression::types::string::StringColumnBuilder;
use common_expression::types::GenericMap;
Expand All @@ -29,6 +30,8 @@ use common_expression::Value;
use common_expression::ValueRef;
use itertools::izip;

use super::soundex::Soundex;

pub fn register(registry: &mut FunctionRegistry) {
registry.register_passthrough_nullable_1_arg::<StringType, StringType, _, _>(
"upper",
Expand Down Expand Up @@ -534,6 +537,74 @@ pub fn register(registry: &mut FunctionRegistry) {
},
),
);

// `ord(str)`: numeric code of the leftmost character of `str`.
//
// * Empty input yields 0.
// * An ASCII leading byte yields that byte's value.
// * Otherwise the shortest prefix that is valid UTF-8 (i.e. the first
//   character) is read as a big-endian integer: the first byte is the most
//   significant one.
// * If the input does not start with a valid UTF-8 character, the result is 0.
registry.register_1_arg::<StringType, UInt64Type, _, _>(
    "ord",
    FunctionProperty::default(),
    |_| None,
    |val: &[u8]| match val.first() {
        None => 0,
        Some(first) if first.is_ascii() => u64::from(*first),
        Some(_) => {
            // A UTF-8 encoded character is at most 4 bytes long, so if none of
            // the first 4 prefixes is valid no longer prefix can be either;
            // there is no need to walk the rest of the input.
            let longest = val.len().min(4);
            (1..=longest)
                .map(|end| &val[..end])
                .find(|prefix| std::str::from_utf8(prefix).is_ok())
                .map_or(0, |ch| {
                    ch.iter()
                        .fold(0_u64, |code, b| (code << 8) | u64::from(*b))
                })
        }
    },
);

// `soundex(str)`: writes the upper-cased leading letter followed by the
// Soundex digits of the remaining letters, padded with '0' up to 4 characters.
registry.register_passthrough_nullable_1_arg::<StringType, StringType, _, _>(
    "soundex",
    FunctionProperty::default(),
    |_| None,
    vectorize_string_to_string(
        |col| usize::max(col.data.len(), 4 * col.len()),
        |val, writer| {
            // Code of the most recently emitted character; `None` until the
            // leading letter has been found.
            let mut prev_code = None;
            let mut emitted = 0;

            for ch in String::from_utf8_lossy(val).chars() {
                let code = Soundex::number_map(ch);
                let out = if prev_code.is_none() {
                    // Still looking for the leading letter: skip anything
                    // that is not alphabetic.
                    if !Soundex::is_uni_alphabetic(ch) {
                        continue;
                    }
                    ch.to_ascii_uppercase()
                } else {
                    // Skip non-letters, dropped letters and repeats of the
                    // previously emitted code.
                    let skipped = !ch.is_ascii_alphabetic()
                        || Soundex::is_drop(ch)
                        || code.is_none()
                        || code == prev_code;
                    if skipped {
                        continue;
                    }
                    code.unwrap() as char
                };

                prev_code = code;
                writer.put_char(out);
                emitted += 1;
            }

            // Pad short results with '0' so that at least 4 characters are written.
            for _ in emitted..4 {
                writer.put_char('0');
            }

            writer.commit_row();
            Ok(())
        },
    ),
);
}

// Vectorize string to string function with custom estimate_bytes.
Expand Down

1 comment on commit 8202fed

@vercel
Copy link

@vercel vercel bot commented on 8202fed Sep 9, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

databend – ./

databend-databend.vercel.app
databend-git-main-databend.vercel.app
databend.vercel.app
databend.rs

Please sign in to comment.