Enable tokenized delimiter on searching
It stays disabled on loading (indexing); only searching enables it.
kou committed Dec 26, 2012
1 parent ea633f0 commit 39d295f
Showing 9 changed files with 84 additions and 48 deletions.
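In short, grn_token_open() gains a flags argument, lib/token.h defines GRN_TOKEN_ENABLE_TOKENIZED_DELIMITER for it, and only the search-side call sites in lib/ii.c (token_info_build() and grn_ii_similar_search()) pass the flag; every loading/indexing call site passes 0. A query that already carries tokenized delimiters looks roughly like the sketch below; the delimiter byte sequence (U+FFFE, 0xEF 0xBF 0xBE in UTF-8) is an assumption for illustration and is not shown in this diff.

    /* Hypothetical pre-tokenized query; the U+FFFE delimiter bytes are assumed. */
    #define TOKENIZED_DELIMITER_UTF8 "\xef\xbf\xbe"
    const char *query = "full" TOKENIZED_DELIMITER_UTF8
                        "text" TOKENIZED_DELIMITER_UTF8
                        "search";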
2 changes: 2 additions & 0 deletions include/groonga/tokenizer.h
@@ -78,6 +78,8 @@ struct _grn_tokenizer_query {
const char *ptr;
unsigned int length;
grn_encoding encoding;
unsigned int flags;
grn_bool have_tokenized_delimiter;
};

/*
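The two new grn_tokenizer_query members let a tokenizer plugin see whether the caller enabled delimiter handling and whether the normalized query actually contains a delimiter, without recomputing anything itself. A minimal sketch of an init callback reading them, assuming the usual grn_tokenizer_query_open() plugin boilerplate:

    /* Sketch only: num_args/args come from the tokenizer proc; normalize flags are 0. */
    grn_tokenizer_query *query = grn_tokenizer_query_open(ctx, num_args, args, 0);
    if (!query) { return NULL; }
    if (query->have_tokenized_delimiter) {
      /* The input is already split on the tokenized delimiter;
         emit those pieces instead of running the tokenizer's own logic. */
    }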
4 changes: 3 additions & 1 deletion lib/db.c
@@ -5572,9 +5572,11 @@ grn_obj_set_value(grn_ctx *ctx, grn_obj *obj, grn_id id,
switch (value->header.type) {
case GRN_BULK :
{
unsigned int token_flags = 0;
grn_token *token;
if (v && s &&
(token = grn_token_open(ctx, lexicon, v, s, GRN_TOKEN_ADD))) {
(token = grn_token_open(ctx, lexicon, v, s,
GRN_TOKEN_ADD, token_flags))) {
while (!token->status) {
grn_id tid = grn_token_next(ctx, token);
grn_bulk_write(ctx, &buf, (char *)&tid, sizeof(grn_id));
30 changes: 22 additions & 8 deletions lib/ii.c
@@ -4708,13 +4708,15 @@ index_add(grn_ctx *ctx, grn_id rid, grn_obj *lexicon, grn_ii *ii, grn_vgram *vgr
const char *value, size_t value_len)
{
grn_hash *h;
unsigned int token_flags = 0;
grn_token *token;
grn_ii_updspec **u;
grn_id tid, *tp;
grn_rc r, rc = GRN_SUCCESS;
grn_vgram_buf *sbuf = NULL;
if (!rid) { return GRN_INVALID_ARGUMENT; }
if (!(token = grn_token_open(ctx, lexicon, value, value_len, GRN_TOKEN_ADD))) {
if (!(token = grn_token_open(ctx, lexicon, value, value_len,
GRN_TOKEN_ADD, token_flags))) {
return GRN_NO_MEMORY_AVAILABLE;
}
if (vgram) { sbuf = grn_vgram_buf_open(value_len); }
@@ -4764,11 +4766,13 @@ index_del(grn_ctx *ctx, grn_id rid, grn_obj *lexicon, grn_ii *ii, grn_vgram *vgr
const char *value, size_t value_len)
{
grn_hash *h;
unsigned int token_flags = 0;
grn_token *token;
grn_ii_updspec **u;
grn_id tid, *tp;
if (!rid) { return GRN_INVALID_ARGUMENT; }
if (!(token = grn_token_open(ctx, lexicon, value, value_len, GRN_TOKEN_DEL))) {
if (!(token = grn_token_open(ctx, lexicon, value, value_len,
GRN_TOKEN_DEL, token_flags))) {
return GRN_NO_MEMORY_AVAILABLE;
}
h = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), GRN_HASH_TINY);
@@ -4828,6 +4832,7 @@ grn_ii_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, grn_vgram *vgram, unsigned i
{
int j;
grn_value *v;
unsigned int token_flags = 0;
grn_token *token;
grn_rc rc = GRN_SUCCESS;
grn_hash *old, *new;
@@ -4846,7 +4851,8 @@ grn_ii_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, grn_vgram *vgram, unsigned i
goto exit;
}
for (j = newvalues->n_values, v = newvalues->values; j; j--, v++) {
if ((token = grn_token_open(ctx, lexicon, v->str, v->str_len, GRN_TOKEN_ADD))) {
if ((token = grn_token_open(ctx, lexicon, v->str, v->str_len,
GRN_TOKEN_ADD, token_flags))) {
while (!token->status) {
if ((tid = grn_token_next(ctx, token))) {
if (!grn_hash_add(ctx, new, &tid, sizeof(grn_id), (void **) &u, NULL)) {
@@ -4889,7 +4895,8 @@ grn_ii_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, grn_vgram *vgram, unsigned i
goto exit;
}
for (j = oldvalues->n_values, v = oldvalues->values; j; j--, v++) {
if ((token = grn_token_open(ctx, lexicon, v->str, v->str_len, GRN_TOKEN_DEL))) {
if ((token = grn_token_open(ctx, lexicon, v->str, v->str_len,
GRN_TOKEN_DEL, token_flags))) {
while (!token->status) {
if ((tid = grn_token_next(ctx, token))) {
if (!grn_hash_add(ctx, old, &tid, sizeof(grn_id), (void **) &u, NULL)) {
@@ -4967,8 +4974,10 @@ grn_vector2updspecs(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section,
if (in->u.v.body) {
const char *head = GRN_BULK_HEAD(in->u.v.body);
for (j = in->u.v.n_sections, v = in->u.v.sections; j; j--, v++) {
unsigned int token_flags = 0;
if (v->length &&
(token = grn_token_open(ctx, lexicon, head + v->offset, v->length, mode))) {
(token = grn_token_open(ctx, lexicon, head + v->offset, v->length,
mode, token_flags))) {
while (!token->status) {
if ((tid = grn_token_next(ctx, token))) {
if (posting) { GRN_RECORD_PUT(ctx, posting, tid); }
@@ -5415,7 +5424,9 @@ token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string,
const char *key;
uint32_t size;
grn_rc rc = GRN_END_OF_DATA;
grn_token *token = grn_token_open(ctx, lexicon, string, string_len, GRN_TOKEN_GET);
unsigned int token_flags = GRN_TOKEN_ENABLE_TOKENIZED_DELIMITER;
grn_token *token = grn_token_open(ctx, lexicon, string, string_len,
GRN_TOKEN_GET, token_flags);
if (!token) { return GRN_NO_MEMORY_AVAILABLE; }
if (mode == GRN_OP_UNSPLIT) {
if ((ti = token_info_open(ctx, lexicon, ii, (char *)token->orig, token->orig_blen, 0, EX_BOTH))) {
@@ -5699,12 +5710,14 @@ grn_ii_similar_search(grn_ctx *ctx, grn_ii *ii,
grn_rc rc = GRN_SUCCESS;
grn_hash *h;
grn_token *token;
unsigned int token_flags = GRN_TOKEN_ENABLE_TOKENIZED_DELIMITER;
grn_obj *lexicon = ii->lexicon;
if (!lexicon || !ii || !string || !s || !optarg) { return GRN_INVALID_ARGUMENT; }
if (!(h = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(int), 0))) {
return GRN_NO_MEMORY_AVAILABLE;
}
if (!(token = grn_token_open(ctx, lexicon, string, string_len, GRN_TOKEN_GET))) {
if (!(token = grn_token_open(ctx, lexicon, string, string_len,
GRN_TOKEN_GET, token_flags))) {
grn_hash_close(ctx, h);
return GRN_NO_MEMORY_AVAILABLE;
}
@@ -6807,6 +6820,7 @@ grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid,
ii_buffer->block_buf_size = est_len;
}
if ((tmp_lexicon = get_tmp_lexicon(ctx, ii_buffer))) {
unsigned int token_flags = 0;
grn_token *token;
grn_id *buffer = ii_buffer->block_buf;
uint32_t block_pos = ii_buffer->block_pos;
@@ -6818,7 +6832,7 @@ grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid,
buffer[block_pos++] = weight + II_BUFFER_WEIGHT_FLAG;
}
if ((token = grn_token_open(ctx, tmp_lexicon, value,
value_len, GRN_TOKEN_ADD))) {
value_len, GRN_TOKEN_ADD, token_flags))) {
uint32_t pos;
for (pos = 0; !token->status; pos++) {
grn_id tid;
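Taken together, the lib/ii.c hunks establish a simple convention: every loading path (index_add(), index_del(), grn_ii_update(), grn_vector2updspecs(), grn_ii_buffer_tokenize()) passes token flags of 0, while the search paths (token_info_build(), grn_ii_similar_search()) pass GRN_TOKEN_ENABLE_TOKENIZED_DELIMITER. Condensed as a sketch, with each line standing for one kind of call site:

    /* loading/indexing: delimiter handling stays off */
    token = grn_token_open(ctx, lexicon, value, value_len, GRN_TOKEN_ADD, 0);
    /* searching: pre-tokenized queries are honoured */
    token = grn_token_open(ctx, lexicon, string, string_len,
                           GRN_TOKEN_GET, GRN_TOKEN_ENABLE_TOKENIZED_DELIMITER);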
16 changes: 12 additions & 4 deletions lib/token.c
@@ -38,10 +38,14 @@ typedef struct {
static grn_obj *
uvector_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
grn_obj *str;
grn_obj *str, *flags;
grn_uvector_tokenizer *tokenizer;
if (!(flags = grn_ctx_pop(ctx))) {
ERR(GRN_INVALID_ARGUMENT, "[tokenizer][uvector] missing argument: flags");
return NULL;
}
if (!(str = grn_ctx_pop(ctx))) {
ERR(GRN_INVALID_ARGUMENT, "missing argument");
ERR(GRN_INVALID_ARGUMENT, "[tokenizer][uvector] missing argument: string");
return NULL;
}
if (!(tokenizer = GRN_MALLOC(sizeof(grn_uvector_tokenizer)))) {
@@ -461,7 +465,7 @@ grn_token_fin(void)

grn_token *
grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
grn_token_mode mode)
grn_token_mode mode, unsigned int flags)
{
grn_token *token;
grn_encoding encoding;
@@ -486,17 +490,21 @@ grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
token->status = GRN_TOKEN_DOING;
token->force_prefix = 0;
if (tokenizer) {
grn_obj str_;
grn_obj str_, flags_;
GRN_TEXT_INIT(&str_, GRN_OBJ_DO_SHALLOW_COPY);
GRN_TEXT_SET_REF(&str_, str, str_len);
GRN_UINT32_INIT(&flags_, 0);
GRN_UINT32_SET(ctx, &flags_, flags);
token->pctx.caller = NULL;
token->pctx.user_data.ptr = NULL;
token->pctx.proc = (grn_proc *)tokenizer;
token->pctx.hooks = NULL;
token->pctx.currh = NULL;
token->pctx.phase = PROC_INIT;
grn_ctx_push(ctx, &str_);
grn_ctx_push(ctx, &flags_);
((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table, &token->pctx.user_data);
grn_obj_close(ctx, &flags_);
grn_obj_close(ctx, &str_);
} else {
int nflags = 0;
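grn_token_open() now hands two values to the tokenizer proc through the context stack: the string is pushed first and the flags value second, so an init callback has to pop them in the reverse order, flags before string, exactly as uvector_init() above and grn_tokenizer_query_open() in lib/tokenizer.c do. A sketch of an init that follows the same protocol; my_init() and its setup are hypothetical, not part of this commit:

    static grn_obj *
    my_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
    {
      grn_obj *flags = grn_ctx_pop(ctx);  /* pushed last by grn_token_open() */
      grn_obj *str = grn_ctx_pop(ctx);    /* pushed first */
      if (!flags || !str) {
        ERR(GRN_INVALID_ARGUMENT, "[tokenizer][my] missing argument");
        return NULL;
      }
      /* ... keep GRN_TEXT_VALUE(str)/GRN_TEXT_LEN(str) and GRN_UINT32_VALUE(flags)
         in user_data->ptr for the next/fin callbacks ... */
      return NULL;
    }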
5 changes: 4 additions & 1 deletion lib/token.h
@@ -72,8 +72,11 @@ extern grn_obj *grn_token_uvector;
grn_rc grn_token_init(void);
grn_rc grn_token_fin(void);

#define GRN_TOKEN_ENABLE_TOKENIZED_DELIMITER (0x01L<<0)

GRN_API grn_token *grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str,
size_t str_len, grn_token_mode mode);
size_t str_len, grn_token_mode mode,
unsigned int flags);

GRN_API grn_id grn_token_next(grn_ctx *ctx, grn_token *ng);
GRN_API grn_rc grn_token_close(grn_ctx *ctx, grn_token *ng);
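From the caller's side the extended prototype is used as in the sketch below; the loop mirrors the lib/ii.c call sites above, with error handling trimmed to keep it short:

    unsigned int token_flags = GRN_TOKEN_ENABLE_TOKENIZED_DELIMITER;
    grn_token *token = grn_token_open(ctx, lexicon, query, query_len,
                                      GRN_TOKEN_GET, token_flags);
    if (token) {
      while (!token->status) {
        grn_id tid = grn_token_next(ctx, token);
        if (tid) {
          /* look up or register tid in the lexicon's index */
        }
      }
      grn_token_close(ctx, token);
    }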
48 changes: 34 additions & 14 deletions lib/tokenizer.c
@@ -112,11 +112,6 @@ grn_tokenizer_have_tokenized_delimiter(grn_ctx *ctx,
const char *current = str_ptr;
const char *end = str_ptr + str_length;

/* TODO: disabled tokenized delimiter for now.
We should handle it just on query expander -> tokenizer phase not
all phases. */
return GRN_FALSE;

if (encoding != GRN_ENC_UTF8) {
return GRN_FALSE;
}
@@ -140,7 +135,9 @@ grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
unsigned int normalize_flags)
{
grn_obj *flags = grn_ctx_pop(ctx);
grn_obj *query_str = grn_ctx_pop(ctx);

if (query_str == NULL) {
GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
return NULL;
@@ -159,6 +156,11 @@ grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
}
query->normalized_query = NULL;
query->query_buf = NULL;
if (flags) {
query->flags = GRN_UINT32_VALUE(flags);
} else {
query->flags = 0;
}

{
grn_obj * const table = args[0];
@@ -177,28 +179,46 @@ grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
&normalizer);
{
grn_obj *normalized_string;
grn_obj *normalized_query;
if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
normalizer = GRN_NORMALIZER_AUTO;
}
normalized_string = grn_string_open_(ctx,
GRN_TEXT_VALUE(query_str),
GRN_TEXT_LEN(query_str),
normalizer,
normalize_flags,
table_encoding);
if (!normalized_string) {
normalized_query = grn_string_open_(ctx,
GRN_TEXT_VALUE(query_str),
GRN_TEXT_LEN(query_str),
normalizer,
normalize_flags,
table_encoding);
if (!normalized_query) {
GRN_PLUGIN_FREE(ctx, query);
return NULL;
}
query->normalized_query = normalized_string;
query->normalized_query = normalized_query;
memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
query_buf[query_length] = '\0';
query->query_buf = query_buf;
query->ptr = query_buf;
query->length = query_length;
}
query->encoding = table_encoding;

if (query->flags & GRN_TOKEN_ENABLE_TOKENIZED_DELIMITER) {
const char *normalized_string;
unsigned int normalized_string_length;

grn_string_get_normalized(ctx,
query->normalized_query,
&normalized_string,
&normalized_string_length,
NULL);
query->have_tokenized_delimiter =
grn_tokenizer_have_tokenized_delimiter(ctx,
normalized_string,
normalized_string_length,
query->encoding);
} else {
query->have_tokenized_delimiter = GRN_FALSE;
}
}
return query;
}
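With have_tokenized_delimiter computed once in grn_tokenizer_query_open(), a tokenizer's next callback can delegate pre-split input to grn_tokenizer_tokenized_delimiter_next(), which is what the kytea and mecab tokenizers below switch to. A sketch of that branch; the helper's full argument list is truncated in the mecab.c hunk, so the trailing arguments here (remaining string, its length, encoding) are an assumption, as is the tokenizer struct layout:

    if (tokenizer->query->have_tokenized_delimiter) {
      /* emit the next pre-split token and advance past the delimiter */
      tokenizer->next =
        grn_tokenizer_tokenized_delimiter_next(ctx,
                                               &(tokenizer->token),
                                               tokenizer->next,
                                               tokenizer->end - tokenizer->next,
                                               tokenizer->query->encoding);
      return NULL;
    }
    /* otherwise run the tokenizer's normal splitting */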
4 changes: 3 additions & 1 deletion plugins/suggest/suggest.c
@@ -810,7 +810,9 @@ learner_learn_for_suggest(grn_ctx *ctx, grn_suggest_learner *learner)
char keybuf[GRN_TABLE_MAX_KEY_SIZE];
int keylen = grn_table_get_key(ctx, learner->items, learner->post_item_id,
keybuf, GRN_TABLE_MAX_KEY_SIZE);
grn_token *token = grn_token_open(ctx, learner->items, keybuf, keylen, 1);
unsigned int token_flags = 0;
grn_token *token = grn_token_open(ctx, learner->items, keybuf, keylen,
GRN_TOKEN_ADD, token_flags);
if (token) {
grn_id tid;
grn_obj *pre_item = &(learner->pre_item);
12 changes: 2 additions & 10 deletions plugins/tokenizers/kytea.cpp
@@ -153,7 +153,6 @@ struct grn_tokenizer_kytea {
std::vector<std::string> tokens;
std::size_t id;
grn_tokenizer_token token;
bool have_tokenized_delimiter;
const char *rest_query_string;
unsigned int rest_query_string_length;

@@ -163,7 +162,6 @@
tokens(),
id(0),
token(),
have_tokenized_delimiter(false),
rest_query_string(NULL)
{
}
@@ -222,13 +220,7 @@ grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args,
&normalized_string,
&normalized_string_length,
NULL);
tokenizer->have_tokenized_delimiter =
grn_tokenizer_have_tokenized_delimiter(ctx,
normalized_string,
normalized_string_length,
query->encoding);

if (tokenizer->have_tokenized_delimiter) {
if (tokenizer->query->have_tokenized_delimiter) {
tokenizer->rest_query_string = normalized_string;
tokenizer->rest_query_string_length = normalized_string_length;
} else {
@@ -285,7 +277,7 @@ grn_obj *grn_kytea_next(grn_ctx *ctx, int num_args, grn_obj **args,
grn_tokenizer_kytea * const tokenizer =
static_cast<grn_tokenizer_kytea *>(user_data->ptr);

if (tokenizer->have_tokenized_delimiter) {
if (tokenizer->query->have_tokenized_delimiter) {
unsigned int rest_query_string_length =
tokenizer->rest_query_string_length;
const char *rest_query_string =
11 changes: 2 additions & 9 deletions plugins/tokenizers/mecab.c
@@ -36,7 +36,6 @@ typedef struct {
const char *end;
grn_tokenizer_query *query;
grn_tokenizer_token token;
grn_bool have_tokenized_delimiter;
} grn_mecab_tokenizer;

static grn_encoding
@@ -138,13 +137,7 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
&normalized_string,
&normalized_string_length,
NULL);
tokenizer->have_tokenized_delimiter =
grn_tokenizer_have_tokenized_delimiter(ctx,
normalized_string,
normalized_string_length,
query->encoding);

if (tokenizer->have_tokenized_delimiter) {
if (query->have_tokenized_delimiter) {
tokenizer->buf = NULL;
tokenizer->next = normalized_string;
tokenizer->end = tokenizer->next + normalized_string_length;
@@ -200,7 +193,7 @@ mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
grn_mecab_tokenizer *tokenizer = user_data->ptr;
grn_encoding encoding = tokenizer->query->encoding;

if (tokenizer->have_tokenized_delimiter) {
if (tokenizer->query->have_tokenized_delimiter) {
tokenizer->next =
grn_tokenizer_tokenized_delimiter_next(ctx,
&(tokenizer->token),
