Navigation Menu

Skip to content

Commit

Permalink
TokenMecab: add include_form option
Browse files Browse the repository at this point in the history
When enabled, it adds the base form, inflected form, and inflected type to each token's metadata.
  • Loading branch information
kou committed Sep 10, 2018
1 parent 1ed550f commit 818b0f9
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 0 deletions.
22 changes: 22 additions & 0 deletions plugins/tokenizers/mecab.c
Expand Up @@ -51,6 +51,7 @@ typedef struct {
int32_t chunk_size_threshold;
grn_bool include_class;
grn_bool include_reading;
grn_bool include_form;
} grn_mecab_tokenizer_options;

typedef struct {
Expand Down Expand Up @@ -144,6 +145,7 @@ mecab_tokenizer_options_init(grn_mecab_tokenizer_options *options)
options->chunk_size_threshold = grn_mecab_chunk_size_threshold;
options->include_class = GRN_FALSE;
options->include_reading = GRN_FALSE;
options->include_form = GRN_FALSE;
}

static grn_bool
Expand All @@ -161,6 +163,10 @@ mecab_tokenizer_options_need_default_output(grn_mecab_tokenizer_options *options
return GRN_TRUE;
}

if (options->include_form) {
return GRN_TRUE;
}

return GRN_FALSE;
}

Expand Down Expand Up @@ -212,6 +218,12 @@ mecab_tokenizer_options_open(grn_ctx *ctx,
raw_options,
i,
options->include_reading);
} else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "include_form")) {
options->include_form =
grn_vector_get_element_bool(ctx,
raw_options,
i,
options->include_form);
}
} GRN_OPTION_VALUES_EACH_END();

Expand Down Expand Up @@ -825,6 +837,16 @@ mecab_next_default_format(grn_ctx *ctx,
data.ignore_asterisk_value = GRN_FALSE;
mecab_next_default_format_add_feature(ctx, &data, "reading", 7);
}
if (tokenizer->options->include_form) {
add_feature_data data;
data.token = token;
data.features = &features;
data.ignore_empty_value = GRN_TRUE;
data.ignore_asterisk_value = GRN_TRUE;
mecab_next_default_format_add_feature(ctx, &data, "inflected_type", 4);
mecab_next_default_format_add_feature(ctx, &data, "inflected_form", 5);
mecab_next_default_format_add_feature(ctx, &data, "base_form", 6);
}
GRN_OBJ_FIN(ctx, &features);
}

Expand Down
40 changes: 40 additions & 0 deletions test/command/suite/tokenizers/mecab/options/include_form.expected
@@ -0,0 +1,40 @@
tokenize 'TokenMecab("include_form", true)' '行きました'
[
[
0,
0.0,
0.0
],
[
{
"value": "行き",
"position": 0,
"force_prefix": false,
"metadata": {
"inflected_type": "五段・カ行促音便",
"inflected_form": "連用形",
"base_form": "行く"
}
},
{
"value": "まし",
"position": 1,
"force_prefix": false,
"metadata": {
"inflected_type": "特殊・マス",
"inflected_form": "連用形",
"base_form": "ます"
}
},
{
"value": "た",
"position": 2,
"force_prefix": false,
"metadata": {
"inflected_type": "特殊・タ",
"inflected_form": "基本形",
"base_form": "た"
}
}
]
]
5 changes: 5 additions & 0 deletions test/command/suite/tokenizers/mecab/options/include_form.test
@@ -0,0 +1,5 @@
#@on-error omit
tokenize \
'TokenMecab("include_form", true)' \
'行きました'
#@on-error default

0 comments on commit 818b0f9

Please sign in to comment.