Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(functions): migrate trim functions to new expression framework. #6921

Merged
merged 11 commits into from Aug 2, 2022
165 changes: 165 additions & 0 deletions common/functions-v2/src/scalars/string.rs
Expand Up @@ -193,6 +193,125 @@ pub fn register(registry: &mut FunctionRegistry) {
},
|val| val.first().cloned().unwrap_or_default(),
);

// Trim functions
// `ltrim(s)`: drops leading spaces and tabs from `s`.
registry.register_passthrough_nullable_1_arg::<StringType, StringType, _, _>(
    "ltrim",
    FunctionProperty::default(),
    |_| None,
    vectorize_string_to_string(
        |col| col.data.len(),
        |val, writer| {
            // Index of the first byte that is neither a space nor a tab; `None` means the
            // value is empty or entirely blank, in which case an empty row is committed.
            let first_kept = val.iter().position(|&ch| !matches!(ch, b' ' | b'\t'));
            if let Some(start) = first_kept {
                writer.put_slice(&val.as_bytes()[start..]);
            }
            writer.commit_row();
            Ok(())
        },
    ),
);

// `rtrim(s)`: drops trailing spaces and tabs from `s`.
registry.register_passthrough_nullable_1_arg::<StringType, StringType, _, _>(
    "rtrim",
    FunctionProperty::default(),
    |_| None,
    vectorize_string_to_string(
        |col| col.data.len(),
        |val, writer| {
            // Index of the last byte that is neither a space nor a tab; `None` means the
            // value is empty or entirely blank, in which case an empty row is committed.
            let last_kept = val.iter().rposition(|&ch| !matches!(ch, b' ' | b'\t'));
            if let Some(last) = last_kept {
                writer.put_slice(&val.as_bytes()[..last + 1]);
            }
            writer.commit_row();
            Ok(())
        },
    ),
);

// `trim(s)`: drops leading and trailing spaces and tabs from `s`.
registry.register_passthrough_nullable_1_arg::<StringType, StringType, _, _>(
    "trim",
    FunctionProperty::default(),
    |_| None,
    vectorize_string_to_string(
        |col| col.data.len(),
        |val, writer| {
            let is_kept = |ch: &u8| !matches!(*ch, b' ' | b'\t');
            // Bounds of the non-blank core. Both are `None` together when the value is
            // empty or entirely blank, in which case an empty row is committed.
            let first_kept = val.iter().position(is_kept);
            let last_kept = val.iter().rposition(is_kept);
            if let (Some(start), Some(last)) = (first_kept, last_kept) {
                writer.put_slice(&val.as_bytes()[start..last + 1]);
            }
            writer.commit_row();
            Ok(())
        },
    ),
);

// `trim_leading(val, trim_str)`: removes every whole repetition of `trim_str` from the front
// of `val`. An empty `trim_str` leaves `val` unchanged.
registry.register_passthrough_nullable_2_arg::<StringType, StringType, StringType, _, _>(
    "trim_leading",
    FunctionProperty::default(),
    |_, _| None,
    vectorize_string_to_string_2_arg(
        |col, _| col.data.len(),
        |val, trim_str, writer| {
            let mut rest: &[u8] = val;
            // An empty pattern must be special-cased: it is a prefix of every slice, so the
            // loop below would never terminate (and the earlier `chunks(0)` formulation
            // panicked on it).
            if !trim_str.is_empty() {
                while let Some(stripped) = rest.strip_prefix(trim_str) {
                    rest = stripped;
                }
            }
            writer.put_slice(rest);
            writer.commit_row();
            Ok(())
        },
    ),
);

// `trim_trailing(val, trim_str)`: removes every whole repetition of `trim_str` from the end
// of `val`. An empty `trim_str` leaves `val` unchanged.
registry.register_passthrough_nullable_2_arg::<StringType, StringType, StringType, _, _>(
    "trim_trailing",
    FunctionProperty::default(),
    |_, _| None,
    vectorize_string_to_string_2_arg(
        |col, _| col.data.len(),
        |val, trim_str, writer| {
            let mut rest: &[u8] = val;
            // An empty pattern must be special-cased: it is a suffix of every slice, so the
            // loop below would never terminate (and the earlier `rchunks(0)` formulation
            // panicked on it).
            if !trim_str.is_empty() {
                while let Some(stripped) = rest.strip_suffix(trim_str) {
                    rest = stripped;
                }
            }
            writer.put_slice(rest);
            writer.commit_row();
            Ok(())
        },
    ),
);

// `trim_both(val, trim_str)`: removes every whole repetition of `trim_str` from the front of
// `val`, then from the end of what remains. An empty `trim_str` leaves `val` unchanged.
registry.register_passthrough_nullable_2_arg::<StringType, StringType, StringType, _, _>(
    "trim_both",
    FunctionProperty::default(),
    |_, _| None,
    vectorize_string_to_string_2_arg(
        |col, _| col.data.len(),
        |val, trim_str, writer| {
            let mut rest: &[u8] = val;
            // An empty pattern matches everywhere; skip it so the loops terminate (the
            // earlier `chunks(0)` formulation panicked on it).
            if !trim_str.is_empty() {
                while let Some(stripped) = rest.strip_prefix(trim_str) {
                    rest = stripped;
                }
                // The suffix is stripped from the remainder, not from the original value.
                // Computing both offsets on the original value lets them cross when
                // `val.len()` is not a multiple of `trim_str.len()` (e.g. "aaa" trimmed by
                // "aa"), which produced an out-of-order slice range and a panic.
                while let Some(stripped) = rest.strip_suffix(trim_str) {
                    rest = stripped;
                }
            }
            writer.put_slice(rest);
            writer.commit_row();
            Ok(())
        },
    ),
);
}

// Vectorize string to string function with custom estimate_bytes.
Expand All @@ -217,3 +336,49 @@ fn vectorize_string_to_string(
}
}
}

// Vectorize (string, string) -> string function with custom estimate_bytes.
//
// `estimate_bytes` is called with both arguments as columns (a scalar argument is first
// wrapped into a one-row column) and its result is used as the data capacity of the output
// builder. `func` is invoked once per row; the first error it returns is propagated.
fn vectorize_string_to_string_2_arg(
    estimate_bytes: impl Fn(&StringColumn, &StringColumn) -> usize + Copy,
    func: impl Fn(&[u8], &[u8], &mut StringColumnBuilder) -> Result<(), String> + Copy,
) -> impl Fn(
    ValueRef<StringType>,
    ValueRef<StringType>,
    &GenericMap,
) -> Result<Value<StringType>, String>
+ Copy {
    move |lhs, rhs, _| match (lhs, rhs) {
        // Scalar x scalar: evaluate once and return a scalar.
        (ValueRef::Scalar(left), ValueRef::Scalar(right)) => {
            let mut out = StringColumnBuilder::with_capacity(1, 0);
            func(left, right, &mut out)?;
            Ok(Value::Scalar(out.build_scalar()))
        }
        // Scalar x column: the scalar is paired with every row of the column.
        (ValueRef::Scalar(left), ValueRef::Column(right_col)) => {
            let left_col = StringColumnBuilder::repeat(left, 1).build();
            let capacity = estimate_bytes(&left_col, &right_col);
            let mut out = StringColumnBuilder::with_capacity(right_col.len(), capacity);
            right_col
                .iter()
                .try_for_each(|right| func(left, right, &mut out))?;
            Ok(Value::Column(out.build()))
        }
        // Column x scalar: every row of the column is paired with the scalar.
        (ValueRef::Column(left_col), ValueRef::Scalar(right)) => {
            let right_col = StringColumnBuilder::repeat(right, 1).build();
            let capacity = estimate_bytes(&left_col, &right_col);
            let mut out = StringColumnBuilder::with_capacity(left_col.len(), capacity);
            left_col
                .iter()
                .try_for_each(|left| func(left, right, &mut out))?;
            Ok(Value::Column(out.build()))
        }
        // Column x column: rows are paired positionally.
        (ValueRef::Column(left_col), ValueRef::Column(right_col)) => {
            let capacity = estimate_bytes(&left_col, &right_col);
            let mut out = StringColumnBuilder::with_capacity(left_col.len(), capacity);
            left_col
                .iter()
                .zip(right_col.iter())
                .try_for_each(|(left, right)| func(left, right, &mut out))?;
            Ok(Value::Column(out.build()))
        }
    }
}
44 changes: 44 additions & 0 deletions common/functions-v2/tests/it/scalars/parser.rs
Expand Up @@ -112,6 +112,50 @@ pub fn transform_expr(ast: common_ast::ast::Expr, columns: &[(&str, DataType)])
transform_expr(*right, columns),
],
},
common_ast::ast::Expr::Trim {
    span,
    expr,
    trim_where,
} => {
    // Without a trim specification this lowers to the 1-arg `trim`; otherwise to the 2-arg
    // function named after the side being trimmed, with the trim pattern as second argument.
    let (name, args) = match trim_where {
        Some(inner) => {
            let name = match inner.0 {
                common_ast::ast::TrimWhere::Both => "trim_both",
                common_ast::ast::TrimWhere::Leading => "trim_leading",
                common_ast::ast::TrimWhere::Trailing => "trim_trailing",
            };
            let args = vec![
                transform_expr(*expr, columns),
                transform_expr(*inner.1, columns),
            ];
            (name, args)
        }
        None => ("trim", vec![transform_expr(*expr, columns)]),
    };
    RawExpr::FunctionCall {
        span: transform_span(span),
        name: name.to_string(),
        params: vec![],
        args,
    }
}
_ => unimplemented!(),
}
}
Expand Down