Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add soundex function, issue #39880 #48567

Merged
merged 22 commits into from
Apr 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
38 changes: 37 additions & 1 deletion docs/en/sql-reference/functions/string-functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ sidebar_label: Strings

# Functions for Working with Strings

:::note
:::note
Functions for [searching](../../sql-reference/functions/string-search-functions.md) and [replacing](../../sql-reference/functions/string-replace-functions.md) in strings are described separately.
:::

Expand Down Expand Up @@ -1193,6 +1193,42 @@ Result:
```

## concatWithSeparatorAssumeInjective

Same as concatWithSeparator, the difference is that you need to ensure that concatWithSeparator(sep, expr1, expr2, expr3...) → result is injective, it will be used for optimization of GROUP BY.

The function is named “injective” if it always returns different result for different values of arguments. In other words: different arguments never yield identical result.

## soundex

Returns the [Soundex code](https://en.wikipedia.org/wiki/Soundex) of a string.

**Syntax**

``` sql
soundex(val)
```

**Arguments**

- `val` - Input value. [String](../data-types/string.md)

**Returned value**

- The Soundex code of the input value. [String](../data-types/string.md)

**Example**

Query:

``` sql
select soundex('aksel');
```

Result:

``` text
┌─soundex('aksel')─┐
│ A240 │
└──────────────────┘
```

12 changes: 12 additions & 0 deletions docs/zh/sql-reference/functions/string-functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,15 @@ SELECT format('{} {}', 'Hello', 'World')
## trimBoth(s) {#trimboths}

返回一个字符串,用于删除任一侧的空白字符。

## soundex(s)

返回一个字符串的soundex值。输出类型是String,示例如下:

``` sql
select soundex('aksel');

┌─soundex('aksel')─┐
│ A240 │
└──────────────────┘
```
119 changes: 119 additions & 0 deletions src/Functions/soundex.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#include <cctype>

#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include <Common/StringUtils/StringUtils.h>


namespace DB
{

namespace ErrorCodes
{
/// Only declared here (extern); used by SoundexImpl::vectorFixed when it rejects FixedString columns.
extern const int ILLEGAL_COLUMN;
}
/** Soundex algorithm, https://en.wikipedia.org/wiki/Soundex
* Implemented similarly as in most SQL dialects:
* 1. Save the first letter. Map all occurrences of a, e, i, o, u, y, h, w to zero (0)
* 2. Replace all consonants (including the first letter) with digits as follows:
* - b, f, p, v → 1
* - c, g, j, k, q, s, x, z → 2
* - d, t → 3
* - l → 4
* - m, n → 5
* - r → 6
* 3. Replace all adjacent same digits with one digit, and then remove all the zero (0) digits
* 4. If the saved letter's digit is the same as the resulting first digit, remove the digit (keep the letter).
* 5. Append 3 zeros if result contains less than 3 digits. Remove all except first letter and 3 digits after it.
*/

struct SoundexImpl
{
    /// Number of characters written for every code: one letter followed by digits, padded with '0'.
    static constexpr auto length = 4z;
    /// Digit for each letter 'A'..'Z' (indexed by letter - 'A'); '0' means the letter emits no digit.
    static constexpr auto soundex_map = "01230120022455012623010202";

    /// Writes exactly `length` characters of the Soundex code of [value, value + value_length) to `out`.
    /// Characters that are not ASCII letters are skipped entirely, so they neither emit a digit
    /// nor separate two letters that share the same digit.
    /// If the input contains no ASCII letter at all, the output is all '0' characters.
    /// No terminating zero is written here.
    static void calculate(const char * value, size_t value_length, char * out)
    {
        const char * pos = value;
        const char * const value_end = value + value_length;
        char * dst = out;
        char * const dst_end = out + length;

        /// Advances `pos` to the next ASCII letter (or to the end of the input).
        auto skip_non_letters = [&pos, value_end]
        {
            while (pos < value_end && !isAlphaASCII(*pos))
                ++pos;
        };

        skip_non_letters();

        /// The first letter is kept verbatim (upper-cased); its digit is remembered so that
        /// an immediately following letter with the same digit is not emitted again.
        char last_code = '0';
        if (pos < value_end)
        {
            const char first_letter = toUpperIfAlphaASCII(*pos);
            ++pos;
            *dst = first_letter;
            ++dst;
            last_code = soundex_map[first_letter - 'A'];
        }

        /// Encode the remaining letters until the input or the output space is exhausted.
        for (skip_non_letters(); pos < value_end && dst < dst_end; skip_non_letters())
        {
            const char code = soundex_map[toUpperIfAlphaASCII(*pos) - 'A'];
            ++pos;

            const bool emits_digit = (code != '0');
            const bool repeats_previous = (code == last_code);
            if (emits_digit && !repeats_previous)
            {
                *dst = code;
                ++dst;
            }

            /// Updated even for '0' letters, so a zero-coded letter between two equal digits separates them.
            last_code = code;
        }

        /// Pad short codes with '0' up to the fixed length.
        for (; dst < dst_end; ++dst)
            *dst = '0';
    }

    /// Computes the code for every row of a String column.
    /// Each result row occupies `length + 1` bytes: the code followed by an explicit '\0'.
    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        const size_t rows = offsets.size();
        res_data.resize(rows * (length + 1));
        res_offsets.resize(rows);

        size_t src_begin = 0;
        size_t dst_begin = 0;
        for (size_t row = 0; row < rows; ++row)
        {
            const size_t src_end = offsets[row];

            /// The last byte of each source row is excluded (presumably the terminating zero of the String column).
            const char * src = reinterpret_cast<const char *>(&data[src_begin]);
            const size_t src_length = src_end - src_begin - 1;
            char * dst = reinterpret_cast<char *>(&res_data[dst_begin]);

            calculate(src, src_length, dst);
            res_data[dst_begin + length] = '\0';

            dst_begin += (length + 1);
            res_offsets[row] = dst_begin;
            src_begin = src_end;
        }
    }

    /// FixedString input is rejected: always throws ILLEGAL_COLUMN.
    [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
    {
        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by soundex function");
    }
};

/// Name tag passed as a template argument when registering the function.
struct NameSoundex
{
    static constexpr const char * name = "soundex";
};

/// Registers `soundex` in the function factory with the CaseInsensitive flag.
REGISTER_FUNCTION(Soundex)
{
    using FunctionSoundex = FunctionStringToString<SoundexImpl, NameSoundex>;

    factory.registerFunction<FunctionSoundex>(
        Documentation{"Returns Soundex code of a string."},
        FunctionFactory::CaseInsensitive);
}


}
27 changes: 27 additions & 0 deletions tests/queries/0_stateless/02711_soundex_function.reference
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
0000
0000
J523
A000
F634
F634
J525
J525
J523
M235
M235
S530
S530
---
0000
0000
J523
A000
F634
F634
J525
J525
J523
M235
M235
S530
S530
28 changes: 28 additions & 0 deletions tests/queries/0_stateless/02711_soundex_function.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- Constant arguments. Per the reference file, inputs without any ASCII letter
-- (empty string, digits only) give 0000, and digits mixed into a name do not change its code.
SELECT soundex('');
SELECT soundex('12345');
SELECT soundex('341Jons54326ton');
SELECT soundex('A2222222');
SELECT soundex('Fairdale');
SELECT soundex('Faredale');
SELECT soundex('Jon1s2o3n');
SELECT soundex('Jonson');
SELECT soundex('Jonston');
SELECT soundex('M\acDonald22321');
SELECT soundex('MacDonald');
SELECT soundex('S3344mith0000');
SELECT soundex('Smith');

-- Separator line between the two halves of the reference output.
SELECT '---';

-- same input strings but in a table
DROP TABLE IF EXISTS tab;
CREATE TABLE tab (col String) Engine=MergeTree ORDER BY col;
INSERT INTO tab VALUES ('') ('12345') ('341Jons54326ton') ('A2222222') ('Fairdale') ('Faredale') ('Jon1s2o3n') ('Jonson') ('Jonston') ('M\acDonald22321') ('MacDonald') ('S3344mith0000') ('Smith');

-- The reference file lists the same codes, in the same order, as for the constant arguments above.
SELECT soundex(col) FROM tab;

DROP TABLE tab;

-- negative tests
SELECT soundex(toFixedString('Smith', 5)); -- { serverError ILLEGAL_COLUMN }
SELECT soundex(5); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }