Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Included UTF-8 test cases and benchmark (also with Regex function)
  • Loading branch information
Eduardo Almeida committed Mar 19, 2020
1 parent 106c634 commit 8f61d54
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 17 deletions.
42 changes: 41 additions & 1 deletion benchmark/micro/string.cpp
Expand Up @@ -7,7 +7,7 @@
using namespace duckdb;
using namespace std;

#define STRING_COUNT 10000000
#define STRING_COUNT 1000000
#define STRING_LENGTH 4

#define STRING_DATA_GEN_BODY(STRING_LENGTH) \
Expand Down Expand Up @@ -140,3 +140,43 @@ string BenchmarkInfo() override {
return "STRING LENGTH";
}
FINISH_BENCHMARK(StringAggLong)

// Micro-benchmark: INSTR with a single-character needle ('h') evaluated over
// column s1 of the `strings` table.
DUCKDB_BENCHMARK(StringInstr, "[string]")
// The macro argument is named STRING_LENGTH in its definition; presumably this
// generates the `strings` table with 4-character values (macro body not
// visible here -- TODO confirm).
STRING_DATA_GEN_BODY(4)
// SQL statement that is benchmarked.
string GetQuery() override {
return "SELECT INSTR(s1, 'h') FROM strings";
}
// Short description of this benchmark.
string BenchmarkInfo() override {
return "STRING INSTR";
}
FINISH_BENCHMARK(StringInstr)

// Micro-benchmark: INSTR with an empty-string needle ('') evaluated over
// column s1 of the `strings` table.
// NOTE(review): despite the "Null" suffix, the needle is the empty string,
// not SQL NULL.
DUCKDB_BENCHMARK(StringInstrNull, "[string]")
// The macro argument is named STRING_LENGTH in its definition; presumably this
// generates the `strings` table with 4-character values (macro body not
// visible here -- TODO confirm).
STRING_DATA_GEN_BODY(4)
// SQL statement that is benchmarked.
string GetQuery() override {
return "SELECT INSTR(s1, '') FROM strings";
}
// Short description of this benchmark.
// NOTE(review): this text is identical to the one returned by StringInstr, so
// the two benchmarks cannot be told apart by their description -- consider
// making it distinct.
string BenchmarkInfo() override {
return "STRING INSTR";
}
FINISH_BENCHMARK(StringInstrNull)

// Micro-benchmark: REGEXP_MATCHES with the single-character pattern 'h'
// evaluated over column s1 of the `strings` table. Uses the same needle as
// the StringInstr benchmark.
DUCKDB_BENCHMARK(StringRegex, "[string]")
// The macro argument is named STRING_LENGTH in its definition; presumably this
// generates the `strings` table with 4-character values (macro body not
// visible here -- TODO confirm).
STRING_DATA_GEN_BODY(4)
// SQL statement that is benchmarked.
string GetQuery() override {
return "SELECT REGEXP_MATCHES(s1, 'h') FROM strings";
}
// Short description of this benchmark.
string BenchmarkInfo() override {
return "STRING REGEX";
}
FINISH_BENCHMARK(StringRegex)

// Micro-benchmark: REGEXP_MATCHES with an empty pattern ('') evaluated over
// column s1 of the `strings` table. Uses the same needle as the
// StringInstrNull benchmark.
// NOTE(review): despite the "Null" suffix, the pattern is the empty string,
// not SQL NULL.
DUCKDB_BENCHMARK(StringRegexNull, "[string]")
// The macro argument is named STRING_LENGTH in its definition; presumably this
// generates the `strings` table with 4-character values (macro body not
// visible here -- TODO confirm).
STRING_DATA_GEN_BODY(4)
// SQL statement that is benchmarked.
string GetQuery() override {
return "SELECT REGEXP_MATCHES(s1, '') FROM strings";
}
// Short description of this benchmark.
// NOTE(review): this text is identical to the one returned by StringRegex, so
// the two benchmarks cannot be told apart by their description -- consider
// making it distinct.
string BenchmarkInfo() override {
return "STRING REGEX";
}
FINISH_BENCHMARK(StringRegexNull)
29 changes: 13 additions & 16 deletions src/function/scalar/string/instr.cpp
Expand Up @@ -11,17 +11,16 @@ using namespace std;

namespace duckdb {

static int32_t instr(string_t haystack, string_t needle);
static int64_t instr(string_t haystack, string_t needle);

// Binary operator wrapper around instr(); it is passed as the operator type to
// ScalarFunction::BinaryFunction when the "instr" function is registered.
struct InstrOperator {
// Forwards both arguments to instr() (left = haystack, right = needle) and
// returns its result converted to TR.
template <class TA, class TB, class TR> static inline TR Operation(TA left, TB right) {
return instr(left, right);
}
};

static int32_t instr(string_t haystack, string_t needle) {
int32_t string_position = 0;
unsigned char firstChar;
static int64_t instr(string_t haystack, string_t needle) {
int64_t string_position = 0;

// Getting information about the needle and the haystack
auto input_haystack = haystack.GetData();
Expand All @@ -30,20 +29,18 @@ static int32_t instr(string_t haystack, string_t needle) {
auto size_needle = needle.GetSize();

// Needle needs something to proceed
// Haystack should be bigger than the needle
if ((size_needle > 0) && (size_haystack >= size_needle)) {
firstChar = input_needle[0];
if (size_needle > 0) {

// find the positions with the first letter
while (size_haystack > 0) {
char b = input_haystack[0];
// Haystack should be bigger or equal size to the needle
while (size_haystack >= size_needle) {

// Compare the first letter and with that compare Needle to the Haystack
if ((b == firstChar) && ((memcmp(input_haystack, input_needle, size_needle) == 0))) {
string_position += (b & 0xC0) != 0x80;
// Increment and check continuation bytes: bit 7 should be set and 6 unset
string_position += (input_haystack[0] & 0xC0) != 0x80;

// Compare Needle to the Haystack
if ((memcmp(input_haystack, input_needle, size_needle) == 0)) {
return string_position;
}
string_position += (b & 0xC0) != 0x80;
size_haystack--;
input_haystack++;
}
Expand All @@ -57,8 +54,8 @@ static int32_t instr(string_t haystack, string_t needle) {
// Adds the scalar function "instr" to the given builtin function set. The
// function takes two VARCHAR arguments and is evaluated through
// ScalarFunction::BinaryFunction using InstrOperator.
// NOTE(review): this span appears to come from a diff view in which the
// INTEGER/int32_t lines and the BIGINT/int64_t lines are the before/after
// versions of the same two lines -- confirm against the actual file.
void InstrFun::RegisterFunction(BuiltinFunctions &set) {
set.AddFunction(ScalarFunction("instr", // name of the function
{SQLType::VARCHAR, SQLType::VARCHAR}, // argument list
SQLType::INTEGER, // return type
ScalarFunction::BinaryFunction<string_t, string_t, int32_t, InstrOperator, true>));
SQLType::BIGINT, // return type
ScalarFunction::BinaryFunction<string_t, string_t, int64_t, InstrOperator, true>));
}

} // namespace duckdb
56 changes: 56 additions & 0 deletions test/sql/function/test_instr.cpp
Expand Up @@ -61,3 +61,59 @@ TEST_CASE("Instr test", "[function]") {
result = con.Query("SELECT instr(NULL,NULL) FROM strings");
REQUIRE(CHECK_COLUMN(result, 0, {Value(), Value(), Value(), Value()}));
}

/* UTF-8 coverage for INSTR, inspired by the substring test case and C language
 * UTF-8 tests.
 *
 * Four strings containing multi-byte UTF-8 sequences are inserted, one row
 * each, and every query below is checked against one expected value per row in
 * insertion order. The expected values are 1-based positions counted in
 * characters (not bytes); 0 means the needle does not occur in that row.
 */
TEST_CASE("Instr test with UTF8", "[function]") {
unique_ptr<QueryResult> result;
DuckDB db(nullptr);
Connection con(db);
con.EnableQueryVerification();
string atomo = "\xc3\xa1tomo"; // átomo: 5 characters, 6 bytes
string portg = "ol\xc3\xa1 mundo"; // olá mundo: 9 characters, 10 bytes
string nihao = "\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c"; // 你好世界: 4 characters, 12 bytes
// "two ñ three ₡ four 🦆 end": mixes 2-, 3- and 4-byte sequences (24 characters, 30 bytes)
string potpourri = "two \xc3\xb1 three \xE2\x82\xA1 four \xF0\x9F\xA6\x86 end";

// Row order matters: expected columns below are {atomo, portg, nihao, potpourri}
REQUIRE_NO_FAIL(con.Query("CREATE TABLE strings(s VARCHAR);"));
REQUIRE_NO_FAIL(con.Query("INSERT INTO strings VALUES ('"+atomo+"')"));
REQUIRE_NO_FAIL(con.Query("INSERT INTO strings VALUES ('"+portg+"')"));
REQUIRE_NO_FAIL(con.Query("INSERT INTO strings VALUES ('"+nihao+"')"));
REQUIRE_NO_FAIL(con.Query("INSERT INTO strings VALUES ('"+potpourri+"')"));


// Test one matching UTF8 letter (á): first character of "átomo", third of "olá mundo"
result = con.Query("SELECT INSTR(s,'\xc3\xa1') FROM strings");
REQUIRE(CHECK_COLUMN(result, 0,
{1,3,0,0}));

// Test a whole sentence containing a UTF-8 character (needle equals the second row)
result = con.Query("SELECT INSTR(s,'ol\xc3\xa1 mundo') FROM strings");
REQUIRE(CHECK_COLUMN(result, 0,
{0,1,0,0}));

// Test an entire word made only of multi-byte characters (needle equals the third row)
result = con.Query("SELECT INSTR(s,'\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c') FROM strings");
REQUIRE(CHECK_COLUMN(result, 0,
{0,0,1,0}));

// Test a substring of the haystack from the beginning
result = con.Query("SELECT instr(s,'two \xc3\xb1 thr') FROM strings");
REQUIRE(CHECK_COLUMN(result, 0,
{0,0,0,1}));

// Test a single UTF8 character (ñ) in the middle of the haystack: 5th character
result = con.Query("SELECT instr(s,'\xc3\xb1') FROM strings");
REQUIRE(CHECK_COLUMN(result, 0,
{0,0,0,5}));

// Test a substring with several multi-byte characters in the middle: starts at ₡, the 13th character
result = con.Query("SELECT instr(s,'\xE2\x82\xA1 four \xF0\x9F\xA6\x86 e') FROM strings");
REQUIRE(CHECK_COLUMN(result, 0,
{0,0,0,13}));

// Test a substring of the haystack from the middle to the end: starts at 🦆, the 20th character
result = con.Query("SELECT instr(s,'\xF0\x9F\xA6\x86 end') FROM strings");
REQUIRE(CHECK_COLUMN(result, 0,
{0,0,0,20}));
}

0 comments on commit 8f61d54

Please sign in to comment.