Skip to content

Commit

Permalink
ARROW-15568: [C++][Gandiva] Implement Translate Function (apache#12333)
Browse files Browse the repository at this point in the history
Lead-authored-by: Vinicius Roque <hownatios@gmail.com>
Co-authored-by: ViniciusSouzaRoque <vinicius.roque@dremio.com>
Signed-off-by: Pindikura Ravindra <ravindra@dremio.com>
  • Loading branch information
2 people authored and djnavarro committed Jun 28, 2022
1 parent 66ec209 commit 67536ca
Show file tree
Hide file tree
Showing 5 changed files with 336 additions and 1 deletion.
6 changes: 5 additions & 1 deletion cpp/src/gandiva/function_registry_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,11 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
NativeFunction::kNeedsContext),

NativeFunction("instr", {}, DataTypeVector{utf8(), utf8()}, int32(),
kResultNullIfNull, "instr_utf8")};
kResultNullIfNull, "instr_utf8"),

NativeFunction("translate", {}, DataTypeVector{utf8(), utf8(), utf8()}, utf8(),
kResultNullIfNull, "translate_utf8_utf8_utf8",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)};

return string_fn_registry_;
}
Expand Down
5 changes: 5 additions & 0 deletions cpp/src/gandiva/gdv_function_stubs.h
Original file line number Diff line number Diff line change
Expand Up @@ -333,4 +333,9 @@ const char* gdv_fn_sha256_timestamp(int64_t context, gdv_timestamp value, bool v
GANDIVA_EXPORT
const char* gdv_fn_sha256_utf8(int64_t context, gdv_utf8 value, int32_t value_length,
bool value_validity, int32_t* out_length);

GANDIVA_EXPORT
const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in_len,
const char* from, int32_t from_len, const char* to,
int32_t to_len, int32_t* out_len);
}
43 changes: 43 additions & 0 deletions cpp/src/gandiva/gdv_function_stubs_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -950,4 +950,47 @@ TEST(TestGdvFnStubs, TestMaskLastN) {
EXPECT_EQ(expected, std::string(result, out_len));
}

TEST(TestGdvFnStubs, TestTranslate) {
gandiva::ExecutionContext ctx;
int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
int32_t out_len = 0;

std::string expected = "ACACACA";
const char* result =
translate_utf8_utf8_utf8(ctx_ptr, "ABABABA", 7, "B", 1, "C", 1, &out_len);
EXPECT_EQ(expected, std::string(result, out_len));

expected = "acde";
result = translate_utf8_utf8_utf8(ctx_ptr, "a b c d e", 9, " b", 2, "", 0, &out_len);
EXPECT_EQ(expected, std::string(result, out_len));

expected = "h3110, h0w ar3 y0u/";
result = translate_utf8_utf8_utf8(ctx_ptr, "hello, how are you?", 19, "elo?", 4, "310/",
4, &out_len);
EXPECT_EQ(expected, std::string(result, out_len));

expected = "1b9ef";
result = translate_utf8_utf8_utf8(ctx_ptr, "abcdef", 6, "adc", 3, "19", 2, &out_len);
EXPECT_EQ(expected, std::string(result, out_len));

expected = "abcd";
result = translate_utf8_utf8_utf8(ctx_ptr, "a b c d", 7, " ", 1, "", 0, &out_len);
EXPECT_EQ(expected, std::string(result, out_len));

expected = "1b9c9e1f";
result =
translate_utf8_utf8_utf8(ctx_ptr, "abdcdeaf", 8, "adad", 4, "192", 3, &out_len);
EXPECT_EQ(expected, std::string(result, out_len));

expected = "012345678";
result = translate_utf8_utf8_utf8(ctx_ptr, "123456789", 9, "987654321", 9, "0123456789",
10, &out_len);
EXPECT_EQ(expected, std::string(result, out_len));

expected = "012345678";
result = translate_utf8_utf8_utf8(ctx_ptr, "987654321", 9, "123456789", 9, "0123456789",
10, &out_len);
EXPECT_EQ(expected, std::string(result, out_len));
}

} // namespace gandiva
233 changes: 233 additions & 0 deletions cpp/src/gandiva/gdv_string_function_stubs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include <utf8proc.h>
#include <string>
#include <unordered_map>
#include <vector>

#include "arrow/util/double_conversion.h"
Expand Down Expand Up @@ -449,6 +450,222 @@ const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_
*out_len = out_idx;
return out;
}
GANDIVA_EXPORT
const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in_len,
const char* from, int32_t from_len, const char* to,
int32_t to_len, int32_t* out_len) {
if (in_len <= 0) {
*out_len = 0;
return "";
}

if (from_len <= 0) {
*out_len = in_len;
return in;
}

// This variable is to control if there are multi-byte utf8 entries
bool has_multi_byte = false;

// This variable is to store the final result
char* result;
int result_len;

// Searching multi-bytes in In
for (int i = 0; i < in_len; i++) {
unsigned char char_single_byte = in[i];
if (char_single_byte > 127) {
// found a multi-byte utf-8 char
has_multi_byte = true;
break;
}
}

// Searching multi-bytes in From
if (!has_multi_byte) {
for (int i = 0; i < from_len; i++) {
unsigned char char_single_byte = from[i];
if (char_single_byte > 127) {
// found a multi-byte utf-8 char
has_multi_byte = true;
break;
}
}
}

// Searching multi-bytes in To
if (!has_multi_byte) {
for (int i = 0; i < to_len; i++) {
unsigned char char_single_byte = to[i];
if (char_single_byte > 127) {
// found a multi-byte utf-8 char
has_multi_byte = true;
break;
}
}
}

// If there are no multibytes in the input, work only with char
if (!has_multi_byte) {
// This variable is for receive the substitutions
result = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, in_len));

if (result == nullptr) {
gdv_fn_context_set_error_msg(context,
"Could not allocate memory for output string");
*out_len = 0;
return "";
}
result_len = 0;

// Creating a Map to mark substitutions to make
std::unordered_map<char, char> subs_list;

// This variable is for controlling the position in entry TO, for never repeat the
// changes
int start_compare;

if (to_len > 0) {
start_compare = 0;
} else {
start_compare = -1;
}

// If the position in TO is out of range, this variable will be associated to map
// list, to mark deletion positions
const char empty = '\0';

for (int in_for = 0; in_for < in_len; in_for++) {
if (subs_list.find(in[in_for]) != subs_list.end()) {
if (subs_list[in[in_for]] != empty) {
// If exist in map, only add the correspondent value in result
result[result_len] = subs_list[in[in_for]];
result_len++;
}
} else {
for (int from_for = 0; from_for <= from_len; from_for++) {
if (from_for == from_len) {
// If it's not in the FROM list, just add it to the map and the result.
subs_list.insert(std::pair<char, char>(in[in_for], in[in_for]));
result[result_len] = in[in_for];
result_len++;
break;
}
if (in[in_for] != from[from_for]) {
// If this character does not exist in FROM list, don't need treatment
continue;
} else if (start_compare == -1 || start_compare == to_len) {
// If exist but the start_compare is out of range, add to map as empty, to
// deletion later
subs_list.insert(std::pair<char, char>(in[in_for], empty));
break;
} else {
// If exist and the start_compare is in range, add to map with the
// corresponding TO in position start_compare
subs_list.insert(std::pair<char, char>(in[in_for], to[start_compare]));
result[result_len] = subs_list[in[in_for]];
result_len++;
start_compare++;
break; // for ignore duplicates entries in FROM, ex: ("adad")
}
}
}
}
} else { // If there are no multibytes in the input, work with std::strings
// This variable is for receive the substitutions, malloc is in_len * 4 to receive
// possible inputs with 4 bytes
result = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, in_len * 4));

if (result == nullptr) {
gdv_fn_context_set_error_msg(context,
"Could not allocate memory for output string");
*out_len = 0;
return "";
}
result_len = 0;

// This map is std::string to store multi-bytes entries
std::unordered_map<std::string, std::string> subs_list;

// This variable is for controlling the position in entry TO, for never repeat the
// changes
int start_compare;

if (to_len > 0) {
start_compare = 0;
} else {
start_compare = -1;
}

// If the position in TO is out of range, this variable will be associated to map
// list, to mark deletion positions
const std::string empty = "";

// This variables is to control len of multi-bytes entries
int len_char_in = 0;
int len_char_from = 0;
int len_char_to = 0;

for (int in_for = 0; in_for < in_len; in_for += len_char_in) {
// Updating len to char in this position
len_char_in = gdv_fn_utf8_char_length(in[in_for]);
// Making copy to std::string with length for this char position
std::string insert_copy_key(in + in_for, len_char_in);
if (subs_list.find(insert_copy_key) != subs_list.end()) {
if (subs_list[insert_copy_key] != empty) {
// If exist in map, only add the correspondent value in result
memcpy(result + result_len, subs_list[insert_copy_key].c_str(),
subs_list[insert_copy_key].length());
result_len += static_cast<int>(subs_list[insert_copy_key].length());
}
} else {
for (int from_for = 0; from_for <= from_len; from_for += len_char_from) {
// Updating len to char in this position
len_char_from = gdv_fn_utf8_char_length(from[from_for]);
// Making copy to std::string with length for this char position
std::string copy_from_compare(from + from_for, len_char_from);
if (from_for == from_len) {
// If it's not in the FROM list, just add it to the map and the result.
std::string insert_copy_value(in + in_for, len_char_in);
// Insert in map to next loops
subs_list.insert(
std::pair<std::string, std::string>(insert_copy_key, insert_copy_value));
memcpy(result + result_len, subs_list[insert_copy_key].c_str(),
subs_list[insert_copy_key].length());
result_len += static_cast<int>(subs_list[insert_copy_key].length());
break;
}

if (insert_copy_key != copy_from_compare) {
// If this character does not exist in FROM list, don't need treatment
continue;
} else if (start_compare == -1 || start_compare >= to_len) {
// If exist but the start_compare is out of range, add to map as empty, to
// deletion later
subs_list.insert(std::pair<std::string, std::string>(insert_copy_key, empty));
break;
} else {
// If exist and the start_compare is in range, add to map with the
// corresponding TO in position start_compare
len_char_to = gdv_fn_utf8_char_length(to[start_compare]);
std::string insert_copy_value(to + start_compare, len_char_to);
// Insert in map to next loops
subs_list.insert(
std::pair<std::string, std::string>(insert_copy_key, insert_copy_value));
memcpy(result + result_len, subs_list[insert_copy_key].c_str(),
subs_list[insert_copy_key].length());
result_len += static_cast<int>(subs_list[insert_copy_key].length());
start_compare += len_char_to;
break; // for ignore duplicates entries in FROM, ex: ("adad")
}
}
}
}
}

*out_len = result_len;
return result;
}
}

namespace gandiva {
Expand Down Expand Up @@ -649,5 +866,21 @@ void ExportedStringFunctions::AddMappings(Engine* engine) const {
engine->AddGlobalMappingForFunc("gdv_fn_initcap_utf8",
types->i8_ptr_type() /*return_type*/, args,
reinterpret_cast<void*>(gdv_fn_initcap_utf8));

// translate_utf8_utf8_utf8
args = {
types->i64_type(), // context
types->i8_ptr_type(), // const char*
types->i32_type(), // value_length
types->i8_ptr_type(), // const char*
types->i32_type(), // value_length
types->i8_ptr_type(), // const char*
types->i32_type(), // value_length
types->i32_ptr_type() // out_length
};

engine->AddGlobalMappingForFunc("translate_utf8_utf8_utf8",
types->i8_ptr_type() /*return_type*/, args,
reinterpret_cast<void*>(translate_utf8_utf8_utf8));
}
} // namespace gandiva
50 changes: 50 additions & 0 deletions cpp/src/gandiva/tests/projector_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2857,4 +2857,54 @@ TEST_F(TestProjector, TestLCase) {

EXPECT_ARROW_ARRAY_EQUALS(out_1, outputs.at(0));
}
TEST_F(TestProjector, TestTranslate) {
// schema for input fields
auto field0 = field("f0", arrow::utf8());
auto field1 = field("f1", arrow::utf8());
auto field2 = field("f2", arrow::utf8());

auto schema_translate = arrow::schema({field0, field1, field2});

// output fields
auto field_translate = field("translate", arrow::utf8());

// Build expression
auto translate_expr = TreeExprBuilder::MakeExpression(
"translate", {field0, field1, field2}, field_translate);

std::shared_ptr<Projector> projector;
auto status = Projector::Make(schema_translate, {translate_expr}, TestConfiguration(),
&projector);

EXPECT_TRUE(status.ok()) << status.message();

// Create a row-batch with some sample data
int num_records = 7;
auto array0 = MakeArrowArrayUtf8({"a b c d", "abcde", "My Name Is JHONNY", "\n\n/n/n",
"x大路学路x", "学大路学路大", "abcd"},
{true, true, true, true, true, true, true});

auto array1 = MakeArrowArrayUtf8({" ", "bd", "JHONNY", "/\n", "x", "学大", "abc"},
{true, true, true, true, true, true, true});

auto array2 = MakeArrowArrayUtf8({"", "xb", "XXXXX", "a~", "b", "12", "学大路"},
{true, true, true, true, true, true, true});

// expected output
auto exp_translate = MakeArrowArrayUtf8({"abcd", "axcbe", "My Xame Is XXXXXX", "aa~n~n",
"b大路学路b", "12路1路2", "学大路d"},
{true, true, true, true, true, true, true});

// prepare input record batch
auto in_batch =
arrow::RecordBatch::Make(schema_translate, num_records, {array0, array1, array2});

// Evaluate expression
arrow::ArrayVector outputs;
status = projector->Evaluate(*in_batch, pool_, &outputs);
EXPECT_TRUE(status.ok()) << status.message();

// Validate results
EXPECT_ARROW_ARRAY_EQUALS(exp_translate, outputs.at(0));
}
} // namespace gandiva

0 comments on commit 67536ca

Please sign in to comment.