Navigation Menu

Skip to content

Commit

Permalink
TokenNgram: add unify_digit option
Browse files Browse the repository at this point in the history
TokenNgram("unify_digit", false) is equivalent to TokenBigramSplitDigit (which does not exist ;p)
  • Loading branch information
kou committed Sep 20, 2018
1 parent 02c79b1 commit 95f8fd3
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 0 deletions.
6 changes: 6 additions & 0 deletions lib/tokenizers.c
Expand Up @@ -736,6 +736,12 @@ ngram_open_options(grn_ctx *ctx,
raw_options,
i,
options->unify_alphabet);
} else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_digit")) {
options->unify_digit =
grn_vector_get_element_bool(ctx,
raw_options,
i,
options->unify_digit);
} else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_symbol")) {
options->unify_symbol =
grn_vector_get_element_bool(ctx,
Expand Down
40 changes: 40 additions & 0 deletions test/command/suite/tokenizers/ngram/unify_digit.expected
@@ -0,0 +1,40 @@
tokenize 'TokenNgram("unify_digit", false)' "012345" NormalizerAuto
[
[
0,
0.0,
0.0
],
[
{
"value": "01",
"position": 0,
"force_prefix": false
},
{
"value": "12",
"position": 1,
"force_prefix": false
},
{
"value": "23",
"position": 2,
"force_prefix": false
},
{
"value": "34",
"position": 3,
"force_prefix": false
},
{
"value": "45",
"position": 4,
"force_prefix": false
},
{
"value": "5",
"position": 5,
"force_prefix": false
}
]
]
4 changes: 4 additions & 0 deletions test/command/suite/tokenizers/ngram/unify_digit.test
@@ -0,0 +1,4 @@
tokenize \
'TokenNgram("unify_digit", false)' \
"012345" \
NormalizerAuto

0 comments on commit 95f8fd3

Please sign in to comment.