diff --git a/benchmark/micro/string.cpp b/benchmark/micro/string.cpp index 203d35f902c..ebe19fb6766 100644 --- a/benchmark/micro/string.cpp +++ b/benchmark/micro/string.cpp @@ -7,7 +7,7 @@ using namespace duckdb; using namespace std; -#define STRING_COUNT 10000000 +#define STRING_COUNT 1000000 #define STRING_LENGTH 4 #define STRING_DATA_GEN_BODY(STRING_LENGTH) \ @@ -140,3 +140,43 @@ string BenchmarkInfo() override { return "STRING LENGTH"; } FINISH_BENCHMARK(StringAggLong) + +DUCKDB_BENCHMARK(StringInstr, "[string]") +STRING_DATA_GEN_BODY(4) +string GetQuery() override { + return "SELECT INSTR(s1, 'h') FROM strings"; +} +string BenchmarkInfo() override { + return "STRING INSTR"; +} +FINISH_BENCHMARK(StringInstr) + +DUCKDB_BENCHMARK(StringInstrNull, "[string]") +STRING_DATA_GEN_BODY(4) +string GetQuery() override { + return "SELECT INSTR(s1, '') FROM strings"; +} +string BenchmarkInfo() override { + return "STRING INSTR"; +} +FINISH_BENCHMARK(StringInstrNull) + +DUCKDB_BENCHMARK(StringRegex, "[string]") +STRING_DATA_GEN_BODY(4) +string GetQuery() override { + return "SELECT REGEXP_MATCHES(s1, 'h') FROM strings"; +} +string BenchmarkInfo() override { + return "STRING REGEX"; +} +FINISH_BENCHMARK(StringRegex) + +DUCKDB_BENCHMARK(StringRegexNull, "[string]") +STRING_DATA_GEN_BODY(4) +string GetQuery() override { + return "SELECT REGEXP_MATCHES(s1, '') FROM strings"; +} +string BenchmarkInfo() override { + return "STRING REGEX"; +} +FINISH_BENCHMARK(StringRegexNull) \ No newline at end of file diff --git a/src/function/scalar/string/instr.cpp b/src/function/scalar/string/instr.cpp index 2ee11323b18..7d577f3b207 100644 --- a/src/function/scalar/string/instr.cpp +++ b/src/function/scalar/string/instr.cpp @@ -11,7 +11,7 @@ using namespace std; namespace duckdb { -static int32_t instr(string_t haystack, string_t needle); +static int64_t instr(string_t haystack, string_t needle); struct InstrOperator { template static inline TR Operation(TA left, TB right) { @@ -19,9 +19,8 @@ struct InstrOperator { } }; -static int32_t instr(string_t haystack, string_t needle) { - int32_t string_position = 0; - unsigned char firstChar; +static int64_t instr(string_t haystack, string_t needle) { + int64_t string_position = 0; // Getting information about the needle and the haystack auto input_haystack = haystack.GetData(); @@ -30,20 +29,18 @@ static int32_t instr(string_t haystack, string_t needle) { auto size_needle = needle.GetSize(); // Needle needs something to proceed - // Haystack should be bigger than the needle - if ((size_needle > 0) && (size_haystack >= size_needle)) { - firstChar = input_needle[0]; + if (size_needle > 0) { - // find the positions with the first letter - while (size_haystack > 0) { - char b = input_haystack[0]; + // Haystack should be bigger or equal size to the needle + while (size_haystack >= size_needle) { - // Compare the first letter and with that compare Needle to the Haystack - if ((b == firstChar) && ((memcmp(input_haystack, input_needle, size_needle) == 0))) { - string_position += (b & 0xC0) != 0x80; + // Increment and check continuation bytes: bit 7 should be set and 6 unset + string_position += (input_haystack[0] & 0xC0) != 0x80; + + // Compare Needle to the Haystack + if ((memcmp(input_haystack, input_needle, size_needle) == 0)) { return string_position; } - string_position += (b & 0xC0) != 0x80; size_haystack--; input_haystack++; } @@ -57,8 +54,8 @@ static int32_t instr(string_t haystack, string_t needle) { void InstrFun::RegisterFunction(BuiltinFunctions &set) { set.AddFunction(ScalarFunction("instr", // name of the function {SQLType::VARCHAR, SQLType::VARCHAR}, // argument list - SQLType::INTEGER, // return type - ScalarFunction::BinaryFunction)); + SQLType::BIGINT, // return type + ScalarFunction::BinaryFunction)); } } // namespace duckdb diff --git a/test/sql/function/test_instr.cpp b/test/sql/function/test_instr.cpp index 35ed81fded2..b8a22579e19 100644 --- a/test/sql/function/test_instr.cpp +++ b/test/sql/function/test_instr.cpp @@ -61,3 +61,59 @@ TEST_CASE("Instr test", "[function]") { result = con.Query("SELECT instr(NULL,NULL) FROM strings"); REQUIRE(CHECK_COLUMN(result, 0, {Value(), Value(), Value(), Value()})); } + +/* Inspired by the substring test case and C language UTF-8 tests +* +*/ +TEST_CASE("Instr test with UTF8", "[function]") { + unique_ptr result; + DuckDB db(nullptr); + Connection con(db); + con.EnableQueryVerification(); + string atomo = "\xc3\xa1tomo"; //length 6 + string portg = "ol\xc3\xa1 mundo";//olá mundo length 9 + string nihao = "\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c"; //你好世界 length 4 + string potpourri = "two \xc3\xb1 three \xE2\x82\xA1 four \xF0\x9F\xA6\x86 end"; + + REQUIRE_NO_FAIL(con.Query("CREATE TABLE strings(s VARCHAR);")); + REQUIRE_NO_FAIL(con.Query("INSERT INTO strings VALUES ('"+atomo+"')")); + REQUIRE_NO_FAIL(con.Query("INSERT INTO strings VALUES ('"+portg+"')")); + REQUIRE_NO_FAIL(con.Query("INSERT INTO strings VALUES ('"+nihao+"')")); + REQUIRE_NO_FAIL(con.Query("INSERT INTO strings VALUES ('"+potpourri+"')")); + + + // Test one matching UTF8 letter + result = con.Query("SELECT INSTR(s,'\xc3\xa1') FROM strings"); + REQUIRE(CHECK_COLUMN(result, 0, + {1,3,0,0})); + + // Test a sentence with an UTF-8 + result = con.Query("SELECT INSTR(s,'ol\xc3\xa1 mundo') FROM strings"); + REQUIRE(CHECK_COLUMN(result, 0, + {0,1,0,0})); + + // Test an entire UTF-8 word + result = con.Query("SELECT INSTR(s,'\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96\xe7\x95\x8c') FROM strings"); + REQUIRE(CHECK_COLUMN(result, 0, + {0,0,1,0})); + + // Test a substring of the haystack from the beginning + result = con.Query("SELECT instr(s,'two \xc3\xb1 thr') FROM strings"); + REQUIRE(CHECK_COLUMN(result, 0, + {0,0,0,1})); + + // Test a single UTF8 substring of the haystack in the middle + result = con.Query("SELECT instr(s,'\xc3\xb1') FROM strings"); + REQUIRE(CHECK_COLUMN(result, 0, + {0,0,0,5})); + + // Test a multiple UTF8 substring of the haystack in the middle + result = con.Query("SELECT instr(s,'\xE2\x82\xA1 four \xF0\x9F\xA6\x86 e') FROM strings"); + REQUIRE(CHECK_COLUMN(result, 0, + {0,0,0,13})); + + // Test a substring of the haystack from the middle to the end + result = con.Query("SELECT instr(s,'\xF0\x9F\xA6\x86 end') FROM strings"); + REQUIRE(CHECK_COLUMN(result, 0, + {0,0,0,20})); +} \ No newline at end of file