Skip to content

Commit

Permalink
builtins: add fuzzystrmatch metaphone()
Browse files Browse the repository at this point in the history
Release note (sql change): metaphone() builtin function was added
  • Loading branch information
charlespnh authored and rafiss committed Dec 28, 2023
1 parent bed7aa7 commit d125556
Show file tree
Hide file tree
Showing 8 changed files with 574 additions and 8 deletions.
4 changes: 3 additions & 1 deletion docs/generated/sql/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -1050,6 +1050,8 @@ available replica will error.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="levenshtein"></a><code>levenshtein(source: <a href="string.html">string</a>, target: <a href="string.html">string</a>, ins_cost: <a href="int.html">int</a>, del_cost: <a href="int.html">int</a>, sub_cost: <a href="int.html">int</a>) &rarr; <a href="int.html">int</a></code></td><td><span class="funcdesc"><p>Calculates the Levenshtein distance between two strings. The cost parameters specify how much to charge for each edit operation. Maximum input length is 255 characters.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="metaphone"></a><code>metaphone(source: <a href="string.html">string</a>, max_output_length: <a href="int.html">int</a>) &rarr; <a href="string.html">string</a></code></td><td><span class="funcdesc"><p>Convert a string to its Metaphone code. Maximum input length is 255 characters</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="soundex"></a><code>soundex(source: <a href="string.html">string</a>) &rarr; <a href="string.html">string</a></code></td><td><span class="funcdesc"><p>Convert a string to its Soundex code.</p>
</span></td><td>Immutable</td></tr></tbody>
</table>
Expand Down Expand Up @@ -2870,7 +2872,7 @@ Can be used to define the tile bounds required by ST_AsMVTGeom to convert geomet
</span></td><td>Immutable</td></tr>
<tr><td><a name="decompress"></a><code>decompress(data: <a href="bytes.html">bytes</a>, codec: <a href="string.html">string</a>) &rarr; <a href="bytes.html">bytes</a></code></td><td><span class="funcdesc"><p>Decompress <code>data</code> with the specified <code>codec</code> (<code>gzip</code>, ‘lz4’, ‘snappy’, 'zstd).</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="difference"></a><code>difference(source: <a href="string.html">string</a>, target: <a href="string.html">string</a>) &rarr; <a href="int.html">int</a></code></td><td><span class="funcdesc"><p>Convert two strings to their Soundex codes and then reports the number of matching code positions.</p>
<tr><td><a name="difference"></a><code>difference(source: <a href="string.html">string</a>, target: <a href="string.html">string</a>) &rarr; <a href="int.html">int</a></code></td><td><span class="funcdesc"><p>Convert two strings to their Soundex codes and report the number of matching code positions.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="encode"></a><code>encode(data: <a href="bytes.html">bytes</a>, format: <a href="string.html">string</a>) &rarr; <a href="string.html">string</a></code></td><td><span class="funcdesc"><p>Encodes <code>data</code> using <code>format</code> (<code>hex</code> / <code>escape</code> / <code>base64</code>).</p>
</span></td><td>Immutable</td></tr>
Expand Down
10 changes: 10 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/fuzzystrmatch
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,13 @@ query TTTI
SELECT soundex('Anne'), soundex(NULL), difference('Anne', NULL), difference(NULL, 'Bob');
----
A500 NULL NULL NULL

query TT
SELECT metaphone('GUMBO', 4), metaphone(NULL, 4);
----
KM NULL

query TTT
SELECT metaphone('Night', 4), metaphone('Knight', 4), metaphone('Knives', 4);
----
NFT NFT NFS
37 changes: 33 additions & 4 deletions pkg/sql/sem/builtins/builtins.go
Original file line number Diff line number Diff line change
Expand Up @@ -3972,7 +3972,7 @@ value if you rely on the HLC for accuracy.`,
diff := fuzzystrmatch.Difference(s, t)
return tree.NewDInt(tree.DInt(diff)), nil
},
Info: "Convert two strings to their Soundex codes and then reports the number of matching code positions.",
Info: "Convert two strings to their Soundex codes and report the number of matching code positions.",
Volatility: volatility.Immutable,
},
),
Expand Down Expand Up @@ -4012,10 +4012,39 @@ value if you rely on the HLC for accuracy.`,
Info: "Calculates the Levenshtein distance between two strings. The cost parameters specify how much to " +
"charge for each edit operation. Maximum input length is 255 characters.",
Volatility: volatility.Immutable,
}),
},
),
"levenshtein_less_equal": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 56820, Category: builtinconstants.CategoryFuzzyStringMatching}),
"metaphone": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 56820, Category: builtinconstants.CategoryFuzzyStringMatching}),
"dmetaphone_alt": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 56820, Category: builtinconstants.CategoryFuzzyStringMatching}),
"metaphone": makeBuiltin(
tree.FunctionProperties{Category: builtinconstants.CategoryFuzzyStringMatching},
tree.Overload{
Types: tree.ParamTypes{{Name: "source", Typ: types.String}, {Name: "max_output_length", Typ: types.Int}},
ReturnType: tree.FixedReturnType(types.String),
Fn: func(_ context.Context, _ *eval.Context, args tree.Datums) (tree.Datum, error) {
const maxDefaultLen = 255
s := string(tree.MustBeDString(args[0]))
maxOutputLen := int(tree.MustBeDInt(args[1]))
if len(s) > maxDefaultLen {
return nil, pgerror.Newf(pgcode.InvalidParameterValue,
"argument exceeds maximum length of %d characters", maxDefaultLen)
}
if maxOutputLen > maxDefaultLen {
return nil, pgerror.Newf(pgcode.InvalidParameterValue,
"output exceeds maximum length of %d characters", maxDefaultLen)
}
if maxOutputLen <= 0 {
return nil, pgerror.Newf(pgcode.InvalidParameterValue,
"output length must be > 0")
}
m := fuzzystrmatch.Metaphone(s, maxDefaultLen)
return tree.NewDString(m), nil
},
Info: "Convert a string to its Metaphone code. Maximum input length is 255 characters",
Volatility: volatility.Immutable,
},
),
"dmetaphone": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 56820, Category: builtinconstants.CategoryFuzzyStringMatching}),
"dmetaphone_alt": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 56820, Category: builtinconstants.CategoryFuzzyStringMatching}),

// JSON functions.
// The behavior of both the JSON and JSONB data types in CockroachDB is
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/sem/builtins/fixed_oids.go
Original file line number Diff line number Diff line change
Expand Up @@ -2538,6 +2538,7 @@ var builtinOidsArray = []string{
2570: `array_position(array: refcursor[], elem: refcursor, start: int) -> int`,
2571: `bit_count(val: bytes) -> int`,
2572: `bit_count(val: varbit) -> int`,
2573: `metaphone(source: string, max_output_length: int) -> string`,
}

var builtinOidsBySignature map[string]oid.Oid
Expand Down
2 changes: 2 additions & 0 deletions pkg/util/fuzzystrmatch/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go_library(
name = "fuzzystrmatch",
srcs = [
"leven.go",
"metaphone.go",
"soundex.go",
],
importpath = "github.com/cockroachdb/cockroach/pkg/util/fuzzystrmatch",
Expand All @@ -15,6 +16,7 @@ go_test(
size = "small",
srcs = [
"leven_test.go",
"metaphone_test.go",
"soundex_test.go",
],
embed = [":fuzzystrmatch"],
Expand Down

0 comments on commit d125556

Please sign in to comment.