TokenMecab: add loose_reading option
TODO: GET mode support
kou committed Sep 10, 2018
1 parent ee1e948 commit 94e1c76
Showing 3 changed files with 182 additions and 15 deletions.
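In short, as I read this diff and the new test below: with loose_reading enabled, TokenMecab first emits the usual surface tokens, then a single end-mark token, and finally the reading of each surface token, falling back to the surface itself when MeCab reports no reading (for example the ASCII token "yakiniku"). Presumably this lets spellings that share a reading, such as 焼き肉 and 焼きにく (both ヤキニク), match each other at search time.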
127 changes: 112 additions & 15 deletions plugins/tokenizers/mecab.c
@@ -52,6 +52,7 @@ typedef struct {
grn_bool include_class;
grn_bool include_reading;
grn_bool include_form;
grn_bool loose_reading;
} grn_mecab_tokenizer_options;

typedef struct {
@@ -61,6 +62,13 @@ typedef struct {
const char *next;
const char *end;
grn_tokenizer_query *query;
struct {
grn_bool ing;
grn_bool need;
grn_bool need_end_mark;
grn_obj readings;
size_t offset;
} loose;
} grn_mecab_tokenizer;

static const char *
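What the new loose state tracks, as far as I can tell from the rest of the diff (my annotations, not part of the commit):

/* ing           -- true while the reading pass is replaying collected readings
 * need          -- set once any token contributes a reading, so a reading
 *                  pass must follow the surface pass
 * need_end_mark -- the end-mark token has not been emitted yet
 * readings      -- one reading (or surface fallback) per surface token
 * offset        -- index of the next element of readings to emit */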
@@ -146,6 +154,7 @@ mecab_tokenizer_options_init(grn_mecab_tokenizer_options *options)
options->include_class = GRN_FALSE;
options->include_reading = GRN_FALSE;
options->include_form = GRN_FALSE;
options->loose_reading = GRN_FALSE;
}

static grn_bool
@@ -167,6 +176,10 @@ mecab_tokenizer_options_need_default_output(grn_mecab_tokenizer_options *options)
return GRN_TRUE;
}

if (options->loose_reading) {
return GRN_TRUE;
}

return GRN_FALSE;
}
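The new early return appears to be needed because the readings come from each token's feature fields, which are only available in MeCab's parsed default output format.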

@@ -224,6 +237,12 @@ mecab_tokenizer_options_open(grn_ctx *ctx,
raw_options,
i,
options->include_form);
} else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "loose_reading")) {
options->loose_reading =
grn_vector_get_element_bool(ctx,
raw_options,
i,
options->loose_reading);
}
} GRN_OPTION_VALUES_EACH_END();
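For reference, the option is a boolean parsed with grn_vector_get_element_bool like its siblings; the test added in this commit enables it per call:

tokenize 'TokenMecab("loose_reading", true)' '焼き肉と焼きにくとyakiniku'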

@@ -673,6 +692,12 @@ mecab_init(grn_ctx *ctx, grn_tokenizer_query *query)
}
}

tokenizer->loose.ing = GRN_FALSE;
tokenizer->loose.need = GRN_FALSE;
tokenizer->loose.need_end_mark = GRN_FALSE;
GRN_TEXT_INIT(&(tokenizer->loose.readings), GRN_OBJ_VECTOR);
tokenizer->loose.offset = 0;

return tokenizer;
}

@@ -703,6 +728,27 @@ typedef struct {
grn_bool ignore_asterisk_value;
} add_feature_data;

static size_t
mecab_get_feature(grn_ctx *ctx,
grn_obj *features,
size_t i,
const char **value)
{
size_t n_locations = GRN_BULK_VSIZE(features) / sizeof(uint64_t);
const char *start;
const char *end;

if (i + 2 > n_locations) {
*value = NULL;
return 0;
}

start = (const char *)(GRN_UINT64_VALUE_AT(features, i));
end = ((const char *)(GRN_UINT64_VALUE_AT(features, i + 1))) - 1;
*value = start;
return end - start;
}
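/* The layout assumed above, as I infer it (not documented in this diff):
 * `features` is a GRN_UINT64 vector whose element i holds the address where
 * feature i starts inside MeCab's CSV feature string, and element i + 1
 * points one byte past feature i's trailing delimiter, hence the `- 1` for
 * the end. `i + 2 > n_locations` means feature i has no recorded end
 * boundary, so a missing feature is reported as length 0 with *value NULL. */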

static void
mecab_next_default_format_add_feature(grn_ctx *ctx,
add_feature_data *data,
@@ -711,31 +757,22 @@ mecab_next_default_format_add_feature(grn_ctx *ctx,
{
grn_token *token = data->token;
grn_obj *features = data->features;
- size_t n_locations = GRN_BULK_VSIZE(features) / sizeof(uint64_t);
- const char *feature_start;
- const char *feature_end;
+ const char *feature = NULL;
size_t feature_length;
grn_obj value;

- if (i + 2 > n_locations) {
-   return;
- }
-
- feature_start = (const char *)(GRN_UINT64_VALUE_AT(features, i));
- feature_end = ((const char *)(GRN_UINT64_VALUE_AT(features, i + 1))) - 1;
- feature_length = feature_end - feature_start;
-
+ feature_length = mecab_get_feature(ctx, features, i, &feature);
if (data->ignore_empty_value && feature_length == 0) {
return;
}
if (data->ignore_asterisk_value &&
feature_length == 1 &&
- feature_start[0] == '*') {
+ feature[0] == '*') {
return;
}

GRN_TEXT_INIT(&value, GRN_OBJ_DO_SHALLOW_COPY);
- GRN_TEXT_SET(ctx, &value, feature_start, feature_length);
+ GRN_TEXT_SET(ctx, &value, feature, feature_length);
grn_token_metadata_add(ctx,
grn_token_get_metadata(ctx, token),
name,
Expand All @@ -758,6 +795,38 @@ mecab_next_default_format(grn_ctx *ctx,
size_t surface_length = 0;
grn_obj features;

if (tokenizer->loose.ing && tokenizer->loose.need_end_mark) {
grn_tokenizer_status status = GRN_TOKEN_CONTINUE;
grn_token_set_data(ctx,
token,
GRN_TOKENIZER_END_MARK_UTF8,
GRN_TOKENIZER_END_MARK_UTF8_LEN);
grn_token_set_status(ctx, token, status);
tokenizer->loose.need_end_mark = GRN_FALSE;
return;
}

if (tokenizer->loose.ing) {
grn_tokenizer_status status = GRN_TOKEN_CONTINUE;
const char *reading = NULL;
unsigned int reading_length;

if (tokenizer->loose.offset + 1 ==
grn_vector_size(ctx, &(tokenizer->loose.readings))) {
status = GRN_TOKEN_LAST;
}
reading_length = grn_vector_get_element(ctx,
&(tokenizer->loose.readings),
tokenizer->loose.offset,
&reading,
NULL,
NULL);
grn_token_set_data(ctx, token, reading, reading_length);
grn_token_set_status(ctx, token, status);
tokenizer->loose.offset++;
return;
}

mecab_next_default_format_skip_eos(ctx, tokenizer);
start = surface = tokenizer->next;
GRN_UINT64_INIT(&features, GRN_OBJ_VECTOR);
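The two blocks added above are the replay phase: once loose.ing is set, the next call emits GRN_TOKENIZER_END_MARK_UTF8 exactly once, and each call after that returns one collected reading, using GRN_TOKEN_LAST on the final element of loose.readings.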
@@ -812,7 +881,12 @@
{
grn_tokenizer_status status;
if (current == end || tokenizer->next == end) {
- status = GRN_TOKEN_LAST;
+ if (tokenizer->loose.need) {
+   tokenizer->loose.ing = GRN_TRUE;
+   status = GRN_TOKEN_CONTINUE;
+ } else {
+   status = GRN_TOKEN_LAST;
+ }
} else {
status = GRN_TOKEN_CONTINUE;
}
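And this is the handoff into that phase: when the surface pass reaches the end of input and a reading pass is pending, the token that would have been GRN_TOKEN_LAST is returned as GRN_TOKEN_CONTINUE instead, with loose.ing switched on so Groonga keeps calling the tokenizer.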
@@ -847,6 +921,28 @@
mecab_next_default_format_add_feature(ctx, &data, "inflected_form", 5);
mecab_next_default_format_add_feature(ctx, &data, "base_form", 6);
}
if (tokenizer->options->loose_reading) {
const char *reading = NULL;
size_t reading_length;
reading_length = mecab_get_feature(ctx, &features, 7, &reading);
if (reading_length > 0) {
tokenizer->loose.need = GRN_TRUE;
tokenizer->loose.need_end_mark = GRN_TRUE;
grn_vector_add_element(ctx,
&(tokenizer->loose.readings),
reading,
reading_length,
0,
GRN_DB_TEXT);
} else {
grn_vector_add_element(ctx,
&(tokenizer->loose.readings),
surface,
surface_length,
0,
GRN_DB_TEXT);
}
}
GRN_OBJ_FIN(ctx, &features);
}
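Note that feature 7 being the reading is an IPADIC-style convention (part-of-speech fields, inflection type and form, base form, reading, pronunciation); dictionaries with a different feature layout would put something else at that index. Storing the surface as the fallback is what makes tokens without a reading, like yakiniku, reappear unchanged in the reading pass.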

@@ -943,7 +1039,8 @@ mecab_fin(grn_ctx *ctx, void *user_data)
if (!tokenizer) {
return;
}
- grn_obj_unlink(ctx, &(tokenizer->buf));
+ GRN_OBJ_FIN(ctx, &(tokenizer->loose.readings));
+ GRN_OBJ_FIN(ctx, &(tokenizer->buf));
GRN_PLUGIN_FREE(ctx, tokenizer);
}
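Cleanup moves from grn_obj_unlink to GRN_OBJ_FIN, which finalizes the struct-embedded objects in place, and the new loose.readings vector is finalized alongside the existing buffer.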

@@ -0,0 +1,65 @@
tokenize 'TokenMecab("loose_reading", true)' '焼き肉と焼きにくとyakiniku'
[
[
0,
0.0,
0.0
],
[
{
"value": "焼き肉",
"position": 0,
"force_prefix": false
},
{
"value": "と",
"position": 1,
"force_prefix": false
},
{
"value": "焼きにく",
"position": 2,
"force_prefix": false
},
{
"value": "と",
"position": 3,
"force_prefix": false
},
{
"value": "yakiniku",
"position": 4,
"force_prefix": false
},
{
"value": "￰",
"position": 5,
"force_prefix": false
},
{
"value": "ヤキニク",
"position": 6,
"force_prefix": false
},
{
"value": "ト",
"position": 7,
"force_prefix": false
},
{
"value": "ヤキニク",
"position": 8,
"force_prefix": false
},
{
"value": "ト",
"position": 9,
"force_prefix": false
},
{
"value": "yakiniku",
"position": 10,
"force_prefix": false
}
]
]
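The "￰" at position 5 is GRN_TOKENIZER_END_MARK_UTF8 (U+FFF0) rendered literally; positions 6 through 10 are the reading pass described above.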
@@ -0,0 +1,5 @@
#@on-error omit
tokenize \
'TokenMecab("loose_reading", true)' \
'焼き肉と焼きにくとyakiniku'
#@on-error default
