Navigation Menu

Skip to content

Commit

Permalink
tokenizers regexp: skip the last one-character token
Browse files (browse the repository at this point in the history)
  • Loading branch information
kou committed Apr 2, 2015
1 parent d51ca4a commit 5f42bd9
Showing 1 changed file with 26 additions and 6 deletions.
32 changes: 26 additions & 6 deletions lib/tokenizers.c
Expand Up @@ -478,6 +478,7 @@ typedef struct {
} get;
grn_bool is_begin;
grn_bool is_end;
grn_bool is_first_token;
grn_bool is_overlapping;
const char *next;
const char *end;
Expand Down Expand Up @@ -515,6 +516,7 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)

tokenizer->is_begin = GRN_TRUE;
tokenizer->is_end = GRN_FALSE;
tokenizer->is_first_token = GRN_TRUE;
tokenizer->is_overlapping = GRN_FALSE;

grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
Expand Down Expand Up @@ -659,22 +661,40 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
}
tokenizer->is_overlapping = (n_characters > 1);

if (tokenizer->next == end) {
tokenizer->is_end = GRN_TRUE;
if (mode == GRN_TOKEN_GET) {
if (!tokenizer->get.have_end) {
if (mode == GRN_TOKEN_GET) {
if ((end - tokenizer->next) < ngram_unit) {
if (tokenizer->get.have_end) {
if (tokenizer->next == end) {
tokenizer->is_end = GRN_TRUE;
}
if (status & GRN_TOKEN_UNMATURED) {
if (tokenizer->is_first_token) {
status |= GRN_TOKEN_FORCE_PREFIX;
} else {
status |= GRN_TOKEN_SKIP;
}
}
} else {
tokenizer->is_end = GRN_TRUE;
status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
} else if (status & GRN_TOKEN_UNMATURED) {
status |= GRN_TOKEN_FORCE_PREFIX;
if (status & GRN_TOKEN_UNMATURED) {
status |= GRN_TOKEN_FORCE_PREFIX;
}
}
}
} else {
if (tokenizer->next == end) {
tokenizer->is_end = GRN_TRUE;
}
}

grn_tokenizer_token_push(ctx,
&(tokenizer->token),
GRN_TEXT_VALUE(buffer),
GRN_TEXT_LEN(buffer),
status);
tokenizer->is_first_token = GRN_FALSE;

return NULL;
}

Expand Down

0 comments on commit 5f42bd9

Please sign in to comment.