Navigation Menu

Skip to content

Commit

Permalink
TokenMecab: add include_reading option
Browse files Browse the repository at this point in the history
When the `include_reading` option is enabled, the tokenizer adds each token's reading (katakana pronunciation reported by MeCab) to the token's metadata.
  • Loading branch information
kou committed Sep 10, 2018
1 parent 153dafb commit e262c28
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 0 deletions.
20 changes: 20 additions & 0 deletions plugins/tokenizers/mecab.c
Expand Up @@ -50,6 +50,7 @@ typedef struct {
grn_bool chunked_tokenize;
int32_t chunk_size_threshold;
grn_bool include_class;
grn_bool include_reading;
} grn_mecab_tokenizer_options;

typedef struct {
Expand Down Expand Up @@ -142,6 +143,7 @@ mecab_tokenizer_options_init(grn_mecab_tokenizer_options *options)
options->chunked_tokenize = grn_mecab_chunked_tokenize_enabled;
options->chunk_size_threshold = grn_mecab_chunk_size_threshold;
options->include_class = GRN_FALSE;
options->include_reading = GRN_FALSE;
}

static grn_bool
Expand All @@ -155,6 +157,10 @@ mecab_tokenizer_options_need_default_output(grn_mecab_tokenizer_options *options
return GRN_TRUE;
}

if (options->include_reading) {
return GRN_TRUE;
}

return GRN_FALSE;
}

Expand Down Expand Up @@ -200,6 +206,12 @@ mecab_tokenizer_options_open(grn_ctx *ctx,
raw_options,
i,
options->include_class);
} else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "include_reading")) {
options->include_reading =
grn_vector_get_element_bool(ctx,
raw_options,
i,
options->include_reading);
}
} GRN_OPTION_VALUES_EACH_END();

Expand Down Expand Up @@ -802,6 +814,14 @@ mecab_next_default_format(grn_ctx *ctx,
mecab_next_default_format_add_feature(ctx, &data, "subclass1", 2);
mecab_next_default_format_add_feature(ctx, &data, "subclass2", 3);
}
if (tokenizer->options->include_reading) {
add_feature_data data;
data.token = token;
data.features = &features;
data.ignore_empty_value = GRN_TRUE;
data.ignore_asterisk_value = GRN_FALSE;
mecab_next_default_format_add_feature(ctx, &data, "reading", 7);
}
GRN_OBJ_FIN(ctx, &features);
}

Expand Down
@@ -0,0 +1,34 @@
tokenize 'TokenMecab("include_reading", true)' '焼き肉と焼きにく'
[
[
0,
0.0,
0.0
],
[
{
"value": "焼き肉",
"position": 0,
"force_prefix": false,
"metadata": {
"reading": "ヤキニク"
}
},
{
"value": "と",
"position": 1,
"force_prefix": false,
"metadata": {
"reading": "ト"
}
},
{
"value": "焼きにく",
"position": 2,
"force_prefix": false,
"metadata": {
"reading": "ヤキニク"
}
}
]
]
@@ -0,0 +1,5 @@
#@on-error omit
tokenize \
'TokenMecab("include_reading", true)' \
'焼き肉と焼きにく'
#@on-error default

0 comments on commit e262c28

Please sign in to comment.