Skip to content

Commit

Permalink
TokenNgram: add "remove_blank" option
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Apr 6, 2018
1 parent e30e135 commit 294e0a4
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 1 deletion.
9 changes: 8 additions & 1 deletion lib/tokenizers.c
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ typedef struct {
grn_bool uni_digit;
grn_bool uni_symbol;
grn_bool ignore_blank;
grn_bool remove_blank;
grn_bool loose_symbol;
} grn_ngram_options;

Expand Down Expand Up @@ -278,6 +279,7 @@ ngram_options_init(grn_ngram_options *options, uint8_t unit)
options->uni_digit = GRN_TRUE;
options->uni_symbol = GRN_TRUE;
options->ignore_blank = GRN_FALSE;
options->remove_blank = grn_ngram_tokenizer_remove_blank_enable;
options->loose_symbol = GRN_FALSE;
}

Expand Down Expand Up @@ -359,7 +361,7 @@ ngram_init_raw(grn_ctx *ctx,
unsigned int normalized_length_in_bytes;
grn_ngram_tokenizer *tokenizer;

- if (!grn_ngram_tokenizer_remove_blank_enable) {
+ if (!options->remove_blank) {
normalize_flags &= ~GRN_STRING_REMOVE_BLANK;
}
query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
Expand Down Expand Up @@ -528,6 +530,11 @@ ngram_open_options(grn_ctx *ctx,
raw_options,
i,
options->unit);
} else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "remove_blank")) {
options->remove_blank = grn_vector_get_element_bool(ctx,
raw_options,
i,
options->remove_blank);
} else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "loose_symbol")) {
options->loose_symbol = grn_vector_get_element_bool(ctx,
raw_options,
Expand Down
45 changes: 45 additions & 0 deletions test/command/suite/tokenizers/ngram/remove_blank.expected
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
tokenize 'TokenNgram("remove_blank", false)' " a b c " NormalizerAuto
[
[
0,
0.0,
0.0
],
[
{
"value": " ",
"position": 0,
"force_prefix": false
},
{
"value": "a",
"position": 1,
"force_prefix": false
},
{
"value": " ",
"position": 2,
"force_prefix": false
},
{
"value": "b",
"position": 3,
"force_prefix": false
},
{
"value": " ",
"position": 4,
"force_prefix": false
},
{
"value": "c",
"position": 5,
"force_prefix": false
},
{
"value": " ",
"position": 6,
"force_prefix": true
}
]
]
4 changes: 4 additions & 0 deletions test/command/suite/tokenizers/ngram/remove_blank.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
tokenize \
'TokenNgram("remove_blank", false)' \
" a b c " \
NormalizerAuto

0 comments on commit 294e0a4

Please sign in to comment.