From 91cea774c53d24d1ff5c069b68c7b55f83d01e6e Mon Sep 17 00:00:00 2001
From: Kouhei Sutou
Date: Thu, 5 Apr 2018 15:30:28 +0900
Subject: [PATCH] grn_obj_set_info: accept tokenizer as text bulk

You can use tokenizer name or tokenizer(options, ...) format for it.
---
 lib/db.c       | 278 +++++++++++++++++++++++++++++++++++++++++++------
 lib/expr.c     | 100 ++++++++++++++++-
 lib/grn_expr.h |   6 +
 3 files changed, 349 insertions(+), 35 deletions(-)

diff --git a/lib/db.c b/lib/db.c
index 818de600c6..efac6ed3e7 100644
--- a/lib/db.c
+++ b/lib/db.c
@@ -38,6 +38,7 @@
 #include "grn_util.h"
 #include "grn_cache.h"
 #include "grn_window_functions.h"
+#include "grn_expr.h"
 #include
 #include
 
@@ -8953,15 +8954,256 @@ grn_obj_set_info_source(grn_ctx *ctx, grn_obj *obj, grn_obj *value)
   return rc;
 }
 
+/*
+ * Decides how a text bulk given as a tokenizer should be interpreted.
+ *
+ * The text is scanned one character at a time with grn_charlen().
+ * GRN_TRUE is returned at the first '(' and also at the first character
+ * whose length isn't 1; the caller then parses the text as a script
+ * expression such as "Tokenizer(option1, option2, ...)". GRN_FALSE is
+ * returned when every character is 1 byte long and none of them is '(',
+ * which makes the caller look the text up as a plain object name.
+ */
+static grn_bool
+grn_obj_set_info_is_funcall_call_bulk(grn_ctx *ctx, grn_obj *bulk)
+{
+  const char *current;
+  const char *end;
+
+  current = GRN_TEXT_VALUE(bulk);
+  end = current + GRN_TEXT_LEN(bulk);
+  while (current < end) {
+    int char_length;
+
+    char_length = grn_charlen(ctx, current, end);
+    if (char_length != 1) {
+      /* NOTE(review): text without '(' is reported as a call here too;
+       * presumably the expression parser is expected to reject it. */
+      return GRN_TRUE;
+    }
+
+    if (current[0] == '(') {
+      return GRN_TRUE;
+    }
+
+    current += char_length;
+  }
+
+  return GRN_FALSE;
+}
+
+/*
+ * Checks that `table` is one of the key table types.
+ *
+ * Returns ctx->rc as is for GRN_TABLE_HASH_KEY, GRN_TABLE_PAT_KEY and
+ * GRN_TABLE_DAT_KEY. For any other type, GRN_INVALID_ARGUMENT is
+ * reported with `context_tag` as the message prefix and the resulting
+ * ctx->rc is returned.
+ */
+static grn_rc
+grn_obj_set_info_require_key_table(grn_ctx *ctx,
+                                   grn_obj *table,
+                                   const char *context_tag)
+{
+  switch (table->header.type) {
+  case GRN_TABLE_HASH_KEY :
+  case GRN_TABLE_PAT_KEY :
+  case GRN_TABLE_DAT_KEY :
+    return ctx->rc;
+  default :
+    ERR(GRN_INVALID_ARGUMENT,
+        "%s target object must be one of "
+        "GRN_TABLE_HASH_KEY, GRN_TABLE_PAT_KEY and GRN_TABLE_DAT_KEY: <%s>",
+        context_tag,
+        grn_obj_type_to_string(table->header.type));
+    return ctx->rc;
+  }
+}
+
+/*
+ * Sets the default tokenizer of `table` for GRN_INFO_DEFAULT_TOKENIZER.
+ *
+ * `default_tokenizer` is interpreted as follows:
+ *
+ *   * Text family bulk: a tokenizer name or the
+ *     "Tokenizer(option1, option2, ...)" format. Options, if any, are
+ *     stored with grn_obj_set_option_values() as "tokenizer".
+ *   * Anything else: used as the tokenizer as is. NULL ends up as
+ *     GRN_ID_NIL (NOTE(review): this relies on
+ *     grn_obj_is_text_family_bulk() accepting NULL; confirm).
+ *
+ * GRN_INVALID_ARGUMENT is reported when `table` isn't one of
+ * GRN_TABLE_HASH_KEY, GRN_TABLE_PAT_KEY and GRN_TABLE_DAT_KEY, when
+ * the text can't be resolved, or when the resolved object isn't a
+ * tokenizer proc. Returns ctx->rc.
+ */
+static grn_rc
+grn_obj_set_info_default_tokenizer(grn_ctx *ctx,
+                                   grn_obj *table,
+                                   grn_obj *default_tokenizer)
+{
+  const char *tag = "[info][set][default-tokenizer]";
+  char name[GRN_TABLE_MAX_KEY_SIZE];
+  unsigned int name_size;
+  grn_obj *tokenizer = NULL;
+  grn_id tokenizer_id = GRN_ID_NIL;
+  grn_obj *expression = NULL;
+  grn_obj options;
+
+  GRN_TEXT_INIT(&options, GRN_OBJ_VECTOR);
+
+  if (grn_obj_set_info_require_key_table(ctx, table, tag) != GRN_SUCCESS) {
+    goto exit;
+  }
+
+  name_size = grn_obj_name(ctx, table, name, sizeof(name));
+  if (grn_obj_is_text_family_bulk(ctx, default_tokenizer)) {
+    if (grn_obj_set_info_is_funcall_call_bulk(ctx, default_tokenizer)) {
+      grn_obj *unused;
+      grn_rc options_rc;
+
+      GRN_EXPR_CREATE_FOR_QUERY(ctx, table, expression, unused);
+      grn_expr_parse(ctx,
+                     expression,
+                     GRN_TEXT_VALUE(default_tokenizer),
+                     GRN_TEXT_LEN(default_tokenizer),
+                     NULL,
+                     GRN_OP_MATCH,
+                     GRN_OP_AND,
+                     GRN_EXPR_SYNTAX_SCRIPT);
+      if (ctx->rc != GRN_SUCCESS) {
+        /* ERR() writes its message to ctx->errbuf, so the parser's
+         * message is copied out before it is used as an argument. */
+        char parse_error[GRN_CTX_MSGSIZE];
+        grn_strcpy(parse_error, GRN_CTX_MSGSIZE, ctx->errbuf);
+        ERR(GRN_INVALID_ARGUMENT,
+            "%s[%.*s] failed to parse tokenizer options: <%.*s>: %s",
+            tag,
+            (int)name_size,
+            name,
+            (int)GRN_TEXT_LEN(default_tokenizer),
+            GRN_TEXT_VALUE(default_tokenizer),
+            parse_error);
+        goto exit;
+      }
+      if (!grn_expr_is_simple_function_call(ctx, expression)) {
+        ERR(GRN_INVALID_ARGUMENT,
+            "%s[%.*s] must be Tokenizer(option1, option2, ...) format: <%.*s>",
+            tag,
+            (int)name_size,
+            name,
+            (int)GRN_TEXT_LEN(default_tokenizer),
+            GRN_TEXT_VALUE(default_tokenizer));
+        goto exit;
+      }
+      tokenizer = grn_expr_simple_function_call_get_function(ctx, expression);
+      options_rc = grn_expr_simple_function_call_get_arguments(ctx,
+                                                               expression,
+                                                               &options);
+      if (options_rc != GRN_SUCCESS) {
+        ERR(options_rc,
+            "%s[%.*s] failed to collect tokenizer options: <%.*s>",
+            tag,
+            (int)name_size,
+            name,
+            (int)GRN_TEXT_LEN(default_tokenizer),
+            GRN_TEXT_VALUE(default_tokenizer));
+        goto exit;
+      }
+    } else {
+      tokenizer = grn_ctx_get(ctx,
+                              GRN_TEXT_VALUE(default_tokenizer),
+                              GRN_TEXT_LEN(default_tokenizer));
+      if (!tokenizer) {
+        ERR(GRN_INVALID_ARGUMENT,
+            "%s[%.*s] unknown tokenizer: <%.*s>",
+            tag,
+            (int)name_size,
+            name,
+            (int)GRN_TEXT_LEN(default_tokenizer),
+            GRN_TEXT_VALUE(default_tokenizer));
+        goto exit;
+      }
+    }
+  } else {
+    tokenizer = default_tokenizer;
+  }
+
+  if (tokenizer && !grn_obj_is_tokenizer_proc(ctx, tokenizer)) {
+    char tokenizer_name[GRN_TABLE_MAX_KEY_SIZE];
+    unsigned int tokenizer_name_size;
+
+    tokenizer_name_size = grn_obj_name(ctx,
+                                       tokenizer,
+                                       tokenizer_name,
+                                       sizeof(tokenizer_name));
+    ERR(GRN_INVALID_ARGUMENT,
+        "%s[%.*s] invalid tokenizer: <%.*s>",
+        tag,
+        (int)name_size,
+        name,
+        (int)tokenizer_name_size,
+        tokenizer_name);
+    goto exit;
+  }
+
+  if (tokenizer) {
+    tokenizer_id = grn_obj_id(ctx, tokenizer);
+  }
+  switch (DB_OBJ(table)->header.type) {
+  case GRN_TABLE_HASH_KEY :
+    grn_table_tokenizer_set_proc(ctx,
+                                 &(((grn_hash *)table)->tokenizer),
+                                 tokenizer);
+    ((grn_hash *)table)->header.common->tokenizer = tokenizer_id;
+    break;
+  case GRN_TABLE_PAT_KEY :
+    grn_table_tokenizer_set_proc(ctx,
+                                 &(((grn_pat *)table)->tokenizer),
+                                 tokenizer);
+    ((grn_pat *)table)->header->tokenizer = tokenizer_id;
+    grn_pat_cache_enable(ctx,
+                         ((grn_pat *)table),
+                         GRN_TABLE_PAT_KEY_CACHE_SIZE);
+    break;
+  case GRN_TABLE_DAT_KEY :
+    grn_table_tokenizer_set_proc(ctx,
+                                 &(((grn_dat *)table)->tokenizer),
+                                 tokenizer);
+    ((grn_dat *)table)->header->tokenizer = tokenizer_id;
+    break;
+  default :
+    break;
+  }
+
+  if (grn_vector_size(ctx, &options) > 0) {
+    grn_obj_set_option_values(ctx, table, "tokenizer", -1, &options);
+  }
+
+exit :
+  GRN_OBJ_FIN(ctx, &options);
+
+  if (expression) {
+    grn_obj_close(ctx, expression);
+  }
+
+  return ctx->rc;
+}
+
 static grn_rc
 grn_obj_set_info_token_filters(grn_ctx *ctx,
                                grn_obj *table,
                                grn_obj *token_filters)
 {
+  const char *tag = "[info][set][token-filters]";
   grn_obj *current_token_filters;
   unsigned int i, n_current_token_filters, n_token_filters;
   grn_obj token_filter_names;
 
+  if (grn_obj_set_info_require_key_table(ctx, table, tag) != GRN_SUCCESS) {
+    return ctx->rc;
+  }
+
   switch (table->header.type) {
   case GRN_TABLE_HASH_KEY :
     current_token_filters = &(((grn_hash *)table)->token_filters);
@@ -8973,12 +9215,7 @@ grn_obj_set_info_token_filters(grn_ctx *ctx,
     current_token_filters = &(((grn_dat *)table)->token_filters);
     break;
   default :
-    /* TODO: Show type name instead of type ID */
-    ERR(GRN_INVALID_ARGUMENT,
-        "[info][set][token-filters] target object must be one of "
-        "GRN_TABLE_HASH_KEY, GRN_TABLE_PAT_KEY and GRN_TABLE_DAT_KEY: %d",
-        table->header.type);
-    return ctx->rc;
+    break;
   }
 
   n_current_token_filters =
@@ -9036,34 +9273,7 @@ grn_obj_set_info(grn_ctx *ctx, grn_obj *obj, grn_info_type type, grn_obj *value)
     rc = grn_obj_set_info_source(ctx, obj, value);
     break;
   case GRN_INFO_DEFAULT_TOKENIZER :
-    if (!value || DB_OBJ(value)->header.type == GRN_PROC) {
-      switch (DB_OBJ(obj)->header.type) {
-      case GRN_TABLE_HASH_KEY :
-        grn_table_tokenizer_set_proc(ctx,
-                                     &(((grn_hash *)obj)->tokenizer),
-                                     value);
-        ((grn_hash *)obj)->header.common->tokenizer = grn_obj_id(ctx, value);
-        rc = GRN_SUCCESS;
-        break;
-      case GRN_TABLE_PAT_KEY :
-        grn_table_tokenizer_set_proc(ctx,
-                                     &(((grn_pat *)obj)->tokenizer),
-                                     value);
-        ((grn_pat *)obj)->header->tokenizer = grn_obj_id(ctx, value);
-        grn_pat_cache_enable(ctx,
-                             ((grn_pat *)obj),
-                             GRN_TABLE_PAT_KEY_CACHE_SIZE);
-        rc = GRN_SUCCESS;
-        break;
-      case GRN_TABLE_DAT_KEY :
-        grn_table_tokenizer_set_proc(ctx,
-                                     &(((grn_dat *)obj)->tokenizer),
-                                     value);
-        ((grn_dat *)obj)->header->tokenizer = grn_obj_id(ctx, value);
-        rc = GRN_SUCCESS;
-        break;
-      }
-    }
+    rc = grn_obj_set_info_default_tokenizer(ctx, obj, value);
     break;
   case GRN_INFO_NORMALIZER :
     if (!value || DB_OBJ(value)->header.type == GRN_PROC) {
diff --git a/lib/expr.c b/lib/expr.c
index 5e8d487bdb..2023f3fcab 100644
--- a/lib/expr.c
+++ b/lib/expr.c
@@ -995,7 +995,8 @@ grn_expr_append_obj(grn_ctx *ctx, grn_obj *expr, grn_obj *obj, grn_operator op,
         }
         if (!(grn_obj_is_function_proc(ctx, proc) ||
               grn_obj_is_scorer_proc(ctx, proc) ||
-              grn_obj_is_window_function_proc(ctx, proc))) {
+              grn_obj_is_window_function_proc(ctx, proc) ||
+              grn_obj_is_tokenizer_proc(ctx, proc))) {
           grn_obj buffer;
 
           GRN_TEXT_INIT(&buffer, 0);
@@ -9799,3 +9800,100 @@ grn_expr_estimate_size(grn_ctx *ctx, grn_obj *expr)
 #endif
   GRN_API_RETURN(size);
 }
+
+/*
+ * Returns GRN_TRUE when `expr` consists of one or more GRN_OP_PUSH codes
+ * followed by a single GRN_OP_CALL code at the end, GRN_FALSE otherwise.
+ *
+ * The first pushed value is treated as the function and the rest as its
+ * arguments by grn_expr_simple_function_call_get_function() and
+ * grn_expr_simple_function_call_get_arguments().
+ */
+grn_bool
+grn_expr_is_simple_function_call(grn_ctx *ctx, grn_obj *expr)
+{
+  grn_expr *e = (grn_expr *)expr;
+  grn_expr_code *codes = e->codes;
+  grn_expr_code *codes_end = codes + e->codes_curr;
+
+  /* The pushed function and the call itself need 2 codes at least. */
+  if (e->codes_curr < 2) {
+    return GRN_FALSE;
+  }
+
+  /* Pushes alone aren't a call: the last code must be GRN_OP_CALL. */
+  if (codes_end[-1].op != GRN_OP_CALL) {
+    return GRN_FALSE;
+  }
+
+  for (; codes < codes_end - 1; codes++) {
+    if (codes[0].op != GRN_OP_PUSH) {
+      return GRN_FALSE;
+    }
+  }
+
+  return GRN_TRUE;
+}
+
+/*
+ * Returns the value of the first code of `expr`, which is the function
+ * when grn_expr_is_simple_function_call() is true for `expr`. Returns
+ * NULL when `expr` has no code.
+ */
+grn_obj *
+grn_expr_simple_function_call_get_function(grn_ctx *ctx, grn_obj *expr)
+{
+  grn_expr *e = (grn_expr *)expr;
+
+  if (e->codes_curr == 0) {
+    return NULL;
+  }
+
+  return e->codes[0].value;
+}
+
+/*
+ * Appends the values pushed between the first code (the function) and
+ * the last code (the call) of `expr` to the vector `arguments`. Each
+ * element keeps the domain of the pushed value.
+ *
+ * Returns GRN_INVALID_ARGUMENT when a code in that range isn't
+ * GRN_OP_PUSH, the error of grn_vector_add_element() when adding fails
+ * and GRN_SUCCESS otherwise. Elements that are already added are kept
+ * in `arguments` on error.
+ */
+grn_rc
+grn_expr_simple_function_call_get_arguments(grn_ctx *ctx,
+                                            grn_obj *expr,
+                                            grn_obj *arguments)
+{
+  grn_expr *e = (grn_expr *)expr;
+  grn_expr_code *codes = e->codes;
+  grn_expr_code *codes_end = codes + e->codes_curr;
+
+  /* Fewer than 2 codes leave nothing between function and call. */
+  if (e->codes_curr < 2) {
+    return GRN_SUCCESS;
+  }
+
+  for (codes++; codes < codes_end - 1; codes++) {
+    grn_obj *value = codes[0].value;
+    grn_rc rc;
+
+    if (codes[0].op != GRN_OP_PUSH) {
+      return GRN_INVALID_ARGUMENT;
+    }
+
+    rc = grn_vector_add_element(ctx,
+                                arguments,
+                                GRN_BULK_HEAD(value),
+                                GRN_BULK_VSIZE(value),
+                                0,
+                                value->header.domain);
+    if (rc != GRN_SUCCESS) {
+      return rc;
+    }
+  }
+
+  return GRN_SUCCESS;
+}
diff --git a/lib/grn_expr.h b/lib/grn_expr.h
index 97c384d8ad..1aa1fcd39a 100644
--- a/lib/grn_expr.h
+++ b/lib/grn_expr.h
@@ -83,6 +83,12 @@ void grn_p_expr_code(grn_ctx *ctx, grn_expr_code *code);
 
 grn_obj *grn_expr_alloc_const(grn_ctx *ctx, grn_obj *expr);
 
+grn_bool grn_expr_is_simple_function_call(grn_ctx *ctx, grn_obj *expr);
+grn_obj *grn_expr_simple_function_call_get_function(grn_ctx *ctx, grn_obj *expr);
+grn_rc grn_expr_simple_function_call_get_arguments(grn_ctx *ctx,
+                                                   grn_obj *expr,
+                                                   grn_obj *arguments);
+
 #ifdef __cplusplus
 }
 #endif