Skip to content
Browse files

* lib/token.c: "<token:mecab>" add.

  • Loading branch information...
1 parent ec34a34 commit ff8fca2a94f4285e0877fa327a51f49e1ab88e07 @daijiro daijiro committed
Showing with 156 additions and 312 deletions.
  1. +1 −1 AUTHORS
  2. +4 −0 ChangeLog
  3. +1 −0 groonga.h
  4. +4 −5 lib/db.h
  5. +146 −293 lib/token.c
  6. +0 −13 lib/token.h
View
2 AUTHORS
@@ -4,6 +4,7 @@ Tasuku SUENAGA <a at razil. jp>
Yutaro Shimamura <yu at razil. jp>
Kouhei Sutou <kou at cozmixng. org>
Kazuho Oku <kazuhooku at gmail. com>
+Moriyoshi Koizumi <moriyoshi at gmail. com>
Patches and modules from:
Daisuke Maki <dmaki at cpan. org>
@@ -12,6 +13,5 @@ Hiroyuki OYAMA <oyama at module. jp>
Nguyen Anh Phu <phuna at users. sourceforge. net>
Hideyuki KUROSU <hideyuki. kurosu at gmail. com>
Takuo Kitame <kitame at valinux. co. jp>
-Moriyoshi Koizumi <koizumi at gree. co. jp>
Yoshihiro Oyama <yos-o at smilemark. com>
cZfSunOs.U <sunos at saita. ma>
View
4 ChangeLog
@@ -1,5 +1,9 @@
2009-01-26 Poe MORITA <morita at razil.jp>
+ * lib/token.c: "<token:mecab>" add.
+
+2009-01-26 Poe MORITA <morita at razil.jp>
+
* lib/ql.h (grn_ql_def_native_func): return grn_cell pointer.
2009-01-24 Poe MORITA <morita at razil.jp>
View
1 groonga.h
@@ -81,6 +81,7 @@ typedef enum {
GRN_CONNECTION_REFUSED = -50,
GRN_RESOURCE_BUSY = -51,
GRN_RANGE_ERROR = -52,
+ GRN_TOKENIZER_ERROR = -53,
grn_invalid_format,
grn_file_operation_error,
grn_other_error,
View
9 lib/db.h
@@ -49,10 +49,6 @@ typedef struct {
#define GRN_RSET_SUBRECS_COPY(subrecs,size,n,src) \
(memcpy(GRN_RSET_SUBRECS_NTH(subrecs, size, n), src, size + GRN_RSET_SCORE_SIZE))
-
-
-#define GRN_TABLE_DEFAULT_NGRAM_UNIT_SIZE 2
-
typedef struct _grn_db grn_db;
typedef struct _grn_proc grn_proc;
typedef struct _grn_table_delete_optarg grn_table_delete_optarg;
@@ -216,7 +212,10 @@ enum {
GRN_DB_SHORTTEXT,
GRN_DB_TEXT,
GRN_DB_LONGTEXT,
- GRN_DB_BIGRAM
+ GRN_DB_UNIGRAM,
+ GRN_DB_BIGRAM,
+ GRN_DB_TRIGRAM,
+ GRN_DB_MECAB,
};
#ifdef __cplusplus
View
439 lib/token.c
@@ -21,345 +21,187 @@
#include "pat.h"
#include "hash.h"
-/* ngram */
+/*
inline static grn_token *
-grn_ngram_init(grn_token *token)
+grn_delimited_init(grn_token *token)
{
+ int cl;
+ const char *p = token->nstr->norm;
+ const char *pe = token->nstr->norm + token->nstr->norm_blen;
+ token->orig = (unsigned char *)p;
+ while ((cl = grn_isspace((const char *)p, token->encoding))) {
+ p += cl;
+ if (pe <= p) {
+ token->status = grn_token_done;
+ break;
+ }
+ }
+ token->next = (unsigned char *)p;
return token;
}
inline static grn_id
-grn_ngram_next(grn_token *token)
+grn_delimited_next(grn_token *token)
{
grn_id tid;
grn_obj *table = token->table;
grn_ctx *ctx = token->ctx;
- uint_least8_t *cp = NULL;
- int32_t len = 0, pos;
- const unsigned char *p, *q, *r;
+ int32_t len, offset = token->offset + token->len;
+ const unsigned char *p;
if (token->status == grn_token_done) { return GRN_ID_NIL; }
- token->force_prefix = 0;
- for (p = token->next, pos = token->pos + token->skip; *p; p = r, pos++) {
- if (token->nstr->ctypes) { cp = token->nstr->ctypes + pos; }
- if (token->uni_alpha && GRN_NSTR_CTYPE(*cp) == grn_str_alpha) {
- for (len = 1, r = p;;len++) {
- size_t cl;
- if (!(cl = grn_str_charlen(ctx, (char *)r, token->encoding))) { break; }
- r += cl;
- if (GRN_NSTR_ISBLANK(*cp)) { break; }
- if (GRN_NSTR_CTYPE(*++cp) != grn_str_alpha) { break; }
- }
- {
- size_t blen = r - p;
- if (!blen) {
- token->status = grn_token_done;
- return GRN_ID_NIL;
- }
- token->curr = p;
- token->curr_size = blen;
- tid = grn_table_lookup(ctx, table, p, blen, &token->flags);
- token->skip = len;
- }
- } else if (token->uni_digit && GRN_NSTR_CTYPE(*cp) == grn_str_digit) {
- for (len = 1, r = p;;len++) {
- size_t cl;
- if (!(cl = grn_str_charlen(ctx, (char *)r, token->encoding))) { break; }
- r += cl;
- if (GRN_NSTR_ISBLANK(*cp)) { break; }
- if (GRN_NSTR_CTYPE(*++cp) != grn_str_digit) { break; }
- }
- {
- size_t blen = r - p;
- if (!blen) {
- token->status = grn_token_done;
- return GRN_ID_NIL;
- }
- token->curr = p;
- token->curr_size = (uint32_t)blen;
- tid = grn_table_lookup(ctx, table, p, blen, &token->flags);
- token->skip = len;
- }
- } else if (token->uni_symbol && GRN_NSTR_CTYPE(*cp) == grn_str_symbol) {
- for (len = 1, r = p;;len++) {
- size_t cl;
- if (!(cl = grn_str_charlen(ctx, (char *)r, token->encoding))) { break; }
- r += cl;
- if (GRN_NSTR_ISBLANK(*cp)) { break; }
- if (GRN_NSTR_CTYPE(*++cp) != grn_str_symbol) { break; }
- }
- {
- size_t blen = r - p;
- if (!blen) {
- token->status = grn_token_done;
- return GRN_ID_NIL;
- }
- token->curr = p;
- token->curr_size = (uint32_t)blen;
- tid = grn_table_lookup(ctx, table, p, blen, &token->flags);
- token->skip = len;
- }
- } else {
- size_t cl;
-#ifdef PRE_DEFINED_UNSPLIT_WORDS
- {
- const unsigned char *key = NULL;
- // todo : grn_pat_lcp_search
- if ((tid = grn_sym_common_prefix_search(sym, p))) {
- if (!(key = _grn_sym_key(sym, tid))) {
- token->status = grn_token_not_found;
- return GRN_ID_NIL;
- }
- len = grn_str_len(key, token->encoding, NULL);
- }
- r = p + grn_str_charlen(ctx, p, token->encoding);
- if (tid && (len > 1 || r == p)) {
- if (r != p && pos + len - 1 <= token->tail) { continue; }
- p += strlen(key);
- if (!*p && !(token->flags & GRN_TABLE_ADD)) { token->status = grn_token_done; }
- }
- }
-#endif /* PRE_DEFINED_UNSPLIT_WORDS */
- if (!(cl = grn_str_charlen(ctx, (char *)p, token->encoding))) {
- token->status = grn_token_done;
- return GRN_ID_NIL;
- }
- r = p + cl;
- {
- int blankp = 0;
- for (len = 1, q = r; len < token->ngram_unit; len++) {
- if (cp) {
- if (GRN_NSTR_ISBLANK(*cp)) { blankp++; break; }
- cp++;
- }
- if (!(cl = grn_str_charlen(ctx, (char *)q, token->encoding)) ||
- (token->uni_alpha && GRN_NSTR_CTYPE(*cp) == grn_str_alpha) ||
- (token->uni_digit && GRN_NSTR_CTYPE(*cp) == grn_str_digit) ||
- (token->uni_symbol && GRN_NSTR_CTYPE(*cp) == grn_str_symbol)) {
- break;
- }
- q += cl;
- }
- if (blankp && !(token->flags & GRN_TABLE_ADD)) { continue; }
- }
- if ((!cl || !*q) && !(token->flags & GRN_TABLE_ADD)) { token->status = grn_token_done; }
- if (len < token->ngram_unit) { token->force_prefix = 1; }
- {
- size_t blen = q - p;
- if (!blen) {
- token->status = grn_token_done;
- return GRN_ID_NIL;
- }
- token->curr = p;
- token->curr_size = (uint32_t)blen;
- tid = grn_table_lookup(ctx, table, p, blen, &token->flags);
- token->skip = 1;
- }
+ for (p = token->next, len = 0;;) {
+ size_t cl;
+ if (!(cl = grn_str_charlen(ctx, (char *)p, token->encoding)) ||
+ grn_isspace((const char *)p, token->encoding)) {
+ break;
}
- token->pos = pos;
+ p += cl;
+ len++;
+ }
+ if (!len) {
+ token->status = grn_token_done;
+ return GRN_ID_NIL;
+ }
+ token->curr = token->next;
+ token->curr_size = (uint32_t)(p - token->next);
+ tid = grn_table_lookup(ctx, table, token->curr, token->curr_size, &token->flags);
+ {
+ int cl;
+ while ((cl = grn_isspace((const char *)p, token->encoding))) { p += cl; }
+ token->next = p;
+ token->offset = offset;
token->len = len;
- token->tail = pos + len - 1;
- token->next = r;
- // printf("tid=%d pos=%d tail=%d (%s) %s\n", tid, token->pos, token->tail, _grn_sym_key(sym, tid), r);
- // printf("tid=%d pos=%d tail=%d (%s)\n", tid, token->pos, token->tail, _grn_sym_key(sym, tid));
- if (!tid) {
- token->status = grn_token_not_found;
- } else {
- if (!*r) { token->status = grn_token_done; }
- }
- return tid;
}
- token->status = grn_token_done;
- return GRN_ID_NIL;
+ if (tid == GRN_ID_NIL) {
+ token->status = grn_token_not_found;
+ } else {
+ if (!*p) { token->status = grn_token_done; }
+ }
+ token->pos++;
+ return tid;
}
+ delimited */
-/* mecab */
+/**** new tokenizer ****/
+
+/* mecab tokenizer */
#ifndef NO_MECAB
static mecab_t *sole_mecab; /* single MeCab instance shared by every mecab tokenizer created in this file */
static grn_mutex sole_mecab_lock; /* held while sole_mecab is created and while it parses text; NOTE(review): its initialization is not visible here */
-static char *grn_token_default_mecab_argv[] = {"", "-Owakati"};
-
-static int grn_token_mecab_argc = 2;
-static char **grn_token_mecab_argv = grn_token_default_mecab_argv;
-
#define SOLE_MECAB_CONFIRM do {\
if (!sole_mecab) {\
+ static char *argv[] = {"", "-Owakati"};\
MUTEX_LOCK(sole_mecab_lock);\
- if (!sole_mecab) { sole_mecab = mecab_new(grn_token_mecab_argc, grn_token_mecab_argv); }\
+ if (!sole_mecab) { sole_mecab = mecab_new(2, argv); }\
MUTEX_UNLOCK(sole_mecab_lock);\
}\
} while(0) /* creates sole_mecab on first use with the fixed arguments {"", "-Owakati"}; the pointer is tested again after taking the lock so mecab_new runs at most once; sole_mecab stays NULL if mecab_new fails */
-inline static grn_token *
-grn_mecab_init(grn_token *token)
+typedef struct { /* state of one mecab tokenization, stored in user_data->ptr by mecab_init */
+ mecab_t *mecab; /* MeCab handle; mecab_init points it at the shared sole_mecab */
+ unsigned char *buf; /* heap buffer holding MeCab's output text; released by mecab_fin */
+ unsigned char *next; /* position in buf where mecab_next starts scanning for the next token */
+ unsigned char *end; /* buf + strlen(buf), taken after trailing whitespace has been trimmed */
+ grn_encoding encoding; /* encoding of the table, filled in through grn_table_get_info */
+} grn_mecab_tokenizer;
+
+static grn_rc /* init hook registered for "<token:mecab>": runs MeCab over the whole input once and keeps the output for mecab_next */
+mecab_init(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data, /* user_data->ptr receives the new grn_mecab_tokenizer */
+ int argc, grn_proc_data *argv) /* argv[0].ptr is the text and argv[1].int_value the length handed to mecab_sparse_tostr3; argc is not read */
{
- grn_ctx *ctx = token->ctx;
- unsigned int bufsize, maxtrial = 10, len;
- grn_nstr *nstr = token->nstr;
char *buf, *s, *p;
char mecab_err[256];
- // grn_log("(%s)", str);
+ grn_mecab_tokenizer *token;
+ unsigned int bufsize, maxtrial = 10, len;
SOLE_MECAB_CONFIRM; /* create the shared MeCab instance if it does not exist yet */
if (!sole_mecab) {
GRN_LOG(grn_log_alert, "mecab_new failed on grn_mecab_init");
- return NULL;
+ return GRN_TOKENIZER_ERROR;
}
+ if (!(token = GRN_MALLOC(sizeof(grn_mecab_tokenizer)))) { return ctx->rc; } /* allocation failure: return ctx->rc */
+ user_data->ptr = token; /* published before parsing; see the review notes on the error paths below */
token->mecab = sole_mecab;
// if (!(token->mecab = mecab_new3())) {
- len = nstr->norm_blen;
+ len = argv[1].int_value;
mecab_err[sizeof(mecab_err) - 1] = '\0'; /* pre-terminate: the strncpy below copies at most sizeof - 1 bytes */
for (bufsize = len * 2 + 1; maxtrial; bufsize *= 2, maxtrial--) { /* at most 10 attempts, doubling the output buffer size each time */
if(!(buf = GRN_MALLOC(bufsize + 1))) {
- GRN_LOG(grn_log_alert, "buffer allocation on grn_mecab_init failed !");
+ GRN_LOG(grn_log_alert, "buffer allocation on mecab_init failed !");
GRN_FREE(token); /* NOTE(review): user_data->ptr was set above and still points at this freed block */
- return NULL;
+ return ctx->rc;
}
MUTEX_LOCK(sole_mecab_lock); /* the shared instance is only used while holding the lock */
- s = mecab_sparse_tostr3(token->mecab, (char *)nstr->norm, len, buf, bufsize);
+ s = mecab_sparse_tostr3(token->mecab, (char *)argv[0].ptr, len, buf, bufsize);
if (!s) {
strncpy(mecab_err, mecab_strerror(token->mecab), sizeof(mecab_err) - 1); /* copy the error text while still under the lock */
}
MUTEX_UNLOCK(sole_mecab_lock);
if (s) { break; } /* success: buf is kept and handed to the tokenizer state below */
GRN_FREE(buf); /* every failed attempt releases its buffer here */
- if (strstr(mecab_err, "output buffer overflow") == NULL) {
- break;
- }
+ if (strstr(mecab_err, "output buffer overflow") == NULL) { break; } /* retry only when the error text reports an output buffer overflow */
}
if (!s) {
- GRN_LOG(grn_log_alert, "mecab_sparse_tostr failed len=%d bufsize=%d err=%s", len, bufsize, mecab_err);
- grn_token_close(token);
- return NULL;
+ GRN_LOG(grn_log_alert, "mecab_sparse_tostr failed len=%d bufsize=%d err=%s",
+ len, bufsize, mecab_err);
+ GRN_FREE(token); /* buf was already released inside the loop; NOTE(review): user_data->ptr still points at this freed block */
+ return GRN_TOKENIZER_ERROR;
}
// certain version of mecab returns trailing lf or spaces.
- for (p = buf + strlen(buf) - 1; buf <= p && (*p == '\n' || isspace(*(unsigned char *)p)); p--) { *p = '\0'; }
+ for (p = buf + strlen(buf) - 1;
+ buf <= p && (*p == '\n' || isspace(*(unsigned char *)p));
+ p--) { *p = '\0'; } /* overwrite trailing newline/space bytes with NUL, walking backwards from the end */
//grn_log("sparsed='%s'", s);
- token->orig = (unsigned char *)nstr->norm;
token->buf = (unsigned char *)buf; /* kept so mecab_fin can free it */
token->next = (unsigned char *)buf; /* scanning starts at the beginning of MeCab's output */
- token->force_prefix = 0;
- return token;
+ token->end = (unsigned char *)buf + strlen(buf); /* end of the trimmed output */
+ grn_table_get_info(ctx, table, NULL, &token->encoding, NULL); /* only the encoding output is requested */
+ return GRN_SUCCESS;
}
-inline static grn_id
-grn_mecab_next(grn_token *token)
+static grn_rc /* next hook registered for "<token:mecab>": reports the next space-delimited piece of the buffer filled by mecab_init */
+mecab_next(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data, /* user_data->ptr is the grn_mecab_tokenizer; table is not read */
+ int argc, grn_proc_data *argv) /* outputs: argv[0].ptr = token start, argv[1].int_value = byte length, argv[2].int_value = GRN_TOKEN_LAST or 0 */
{
- grn_id tid;
- grn_obj *table = token->table;
- grn_ctx *ctx = token->ctx;
- int32_t len, offset = token->offset + token->len;
- const unsigned char *p;
- if (token->status == grn_token_done) { return GRN_ID_NIL; }
- for (p = token->next, len = 0;;) {
- size_t cl;
- if (!(cl = grn_str_charlen(ctx, (char *)p, token->encoding)) ||
- grn_isspace((const char *)p, token->encoding)) {
+ size_t cl;
+ grn_mecab_tokenizer *token = user_data->ptr;
+ const unsigned char *p = token->next, *r; /* p marks the token start, r scans forward from it */
+ const unsigned char *e = token->end;
+ for (r = p; r < e; r += cl) { /* advance one character at a time until a space or the end of the buffer */
+ if (!(cl = grn_str_charlen_nonnull(ctx, (char *)r, (char *)e, token->encoding))) { /* zero character length: stop scanning */
+ token->next = (unsigned char *)r; /* NOTE(review): next stays on this same byte, so a following call would start here again - confirm the caller stops */
+ break;
+ }
+ if (grn_isspace((const char *)r, token->encoding)) { /* the token ends at the first space character */
+ const unsigned char *q = r;
+ while ((cl = grn_isspace((const char *)q, token->encoding))) { q += cl; } /* skip the whole run of spaces */
+ token->next = (unsigned char *)q; /* the following token starts after the spaces */
break;
}
- p += cl;
- len++;
- }
- if (!len) {
- token->status = grn_token_done;
- return GRN_ID_NIL;
- }
- token->curr = token->next;
- token->curr_size = (uint32_t)(p - token->next);
- tid = grn_table_lookup(ctx, table, token->curr, token->curr_size, &token->flags);
- {
- int cl;
- while ((cl = grn_isspace((const char *)p, token->encoding))) { p += cl; }
- token->next = p;
- token->offset = offset;
- token->len = len;
- }
- if (tid == GRN_ID_NIL) {
- token->status = grn_token_not_found;
- } else {
- if (!*p) { token->status = grn_token_done; }
}
- token->pos++;
- return tid;
+ argv[0].ptr = (void *)p; /* token text is not copied; it points into token->buf */
+ argv[1].int_value = r - p; /* length in bytes, excluding the delimiter */
+ argv[2].int_value = r == e ? GRN_TOKEN_LAST : 0; /* set when the scan reached the end of the buffer; token->next is not advanced in that case */
+ return GRN_SUCCESS;
}
-grn_rc
-grn_token_set_mecab_args(grn_ctx *ctx, int argc, char **argv)
+static grn_rc /* fin hook registered for "<token:mecab>": releases the state allocated by mecab_init */
+mecab_fin(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data, /* only user_data->ptr is read */
+ int argc, grn_proc_data *argv)
{
- grn_token_mecab_argc = argc;
- grn_token_mecab_argv = argv;
- if (sole_mecab) {
- GRN_LOG(grn_log_alert, "mecab already initialized");
- return GRN_INVALID_ARGUMENT;
- }
- SOLE_MECAB_CONFIRM;
+ grn_mecab_tokenizer *token = user_data->ptr; /* NOTE(review): dereferenced below without a NULL check - confirm this never runs after a failed mecab_init */
+ // if (token->mecab) { mecab_destroy(token->mecab); }
+ GRN_FREE(token->buf); /* MeCab output buffer */
+ GRN_FREE(token); /* the shared sole_mecab instance itself is not destroyed here */
return GRN_SUCCESS;
}
#endif /* NO_MECAB */
-/* delimited */
-
-inline static grn_token *
-grn_delimited_init(grn_token *token)
-{
- int cl;
- const char *p = token->nstr->norm;
- const char *pe = token->nstr->norm + token->nstr->norm_blen;
- token->orig = (unsigned char *)p;
- while ((cl = grn_isspace((const char *)p, token->encoding))) {
- p += cl;
- if (pe <= p) {
- token->status = grn_token_done;
- break;
- }
- }
- token->next = (unsigned char *)p;
- return token;
-}
-
-inline static grn_id
-grn_delimited_next(grn_token *token)
-{
- grn_id tid;
- grn_obj *table = token->table;
- grn_ctx *ctx = token->ctx;
- int32_t len, offset = token->offset + token->len;
- const unsigned char *p;
- if (token->status == grn_token_done) { return GRN_ID_NIL; }
- for (p = token->next, len = 0;;) {
- size_t cl;
- if (!(cl = grn_str_charlen(ctx, (char *)p, token->encoding)) ||
- grn_isspace((const char *)p, token->encoding)) {
- break;
- }
- p += cl;
- len++;
- }
- if (!len) {
- token->status = grn_token_done;
- return GRN_ID_NIL;
- }
- token->curr = token->next;
- token->curr_size = (uint32_t)(p - token->next);
- tid = grn_table_lookup(ctx, table, token->curr, token->curr_size, &token->flags);
- {
- int cl;
- while ((cl = grn_isspace((const char *)p, token->encoding))) { p += cl; }
- token->next = p;
- token->offset = offset;
- token->len = len;
- }
- if (tid == GRN_ID_NIL) {
- token->status = grn_token_not_found;
- } else {
- if (!*p) { token->status = grn_token_done; }
- }
- token->pos++;
- return tid;
-}
+/* ngram tokenizer */
typedef struct {
uint8_t uni_alpha;
@@ -378,8 +220,8 @@ typedef struct {
} grn_ngram_tokenizer;
static grn_rc
-bigram_init(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data,
- int argc, grn_proc_data *argv)
+ngram_init(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data,
+ int argc, grn_proc_data *argv, uint8_t ngram_unit)
{
grn_ngram_tokenizer *token;
if (!(token = GRN_MALLOC(sizeof(grn_ngram_tokenizer)))) { return ctx->rc; }
@@ -387,7 +229,7 @@ bigram_init(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data,
token->uni_alpha = 1;
token->uni_digit = 1;
token->uni_symbol = 1;
- token->ngram_unit = GRN_TABLE_DEFAULT_NGRAM_UNIT_SIZE;
+ token->ngram_unit = ngram_unit;
token->overlap = 0;
token->pos = 0;
token->skip = 0;
@@ -400,7 +242,22 @@ bigram_init(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data,
}
static grn_rc
-bigram_next(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data,
+unigram_init(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data, /* init hook for "<token:unigram>" */
+ int argc, grn_proc_data *argv)
+{ return ngram_init(ctx, table, user_data, argc, argv, 1); } /* forwards to ngram_init with ngram_unit = 1 */
+
+static grn_rc
+bigram_init(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data, /* init hook for "<token:bigram>" */
+ int argc, grn_proc_data *argv)
+{ return ngram_init(ctx, table, user_data, argc, argv, 2); } /* forwards to ngram_init with ngram_unit = 2 */
+
+static grn_rc
+trigram_init(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data, /* init hook for "<token:trigram>" */
+ int argc, grn_proc_data *argv)
+{ return ngram_init(ctx, table, user_data, argc, argv, 3); } /* forwards to ngram_init with ngram_unit = 3 */
+
+static grn_rc
+ngram_next(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data,
int argc, grn_proc_data *argv)
{
size_t cl;
@@ -446,7 +303,7 @@ bigram_next(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data,
}
len = grn_str_len(key, token->encoding, NULL);
}
- r = p + grn_str_charlen(ctx, p, token->encoding);
+ r = p + grn_str_charlen_nonnull(ctx, p, e, token->encoding);
if (tid && (len > 1 || r == p)) {
if (r != p && pos + len - 1 <= token->tail) { continue; }
p += strlen(key);
@@ -490,7 +347,7 @@ bigram_next(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data,
}
static grn_rc
-bigram_fin(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data,
+ngram_fin(grn_ctx *ctx, grn_obj *table, grn_proc_data *user_data,
int argc, grn_proc_data *argv)
{
GRN_FREE(user_data->ptr);
@@ -557,27 +414,13 @@ grn_token_open(grn_ctx *ctx, grn_obj *table, const char *str, size_t str_len,
token->table_flags = table_flags;
token->encoding = encoding;
token->tokenizer = tokenizer;
-
-#ifndef NO_MECAB
- token->mecab = NULL;
-#endif /* NO_MECAB */
- token->buf = NULL;
token->curr = NULL;
token->curr_size = 0;
token->pos = -1;
- token->skip = 1;
- token->tail = 0;
token->status = grn_token_doing;
token->orig = (unsigned char *)nstr->norm;
token->orig_blen = nstr->norm_blen;
- token->next = (unsigned char *)nstr->norm;
- token->uni_alpha = (nstr->ctypes && !(table_flags & GRN_OBJ_KEY_SPLIT_ALPHA));
- token->uni_digit = (nstr->ctypes && !(table_flags & GRN_OBJ_KEY_SPLIT_DIGIT));
- token->uni_symbol = (nstr->ctypes && !(table_flags & GRN_OBJ_KEY_SPLIT_SYMBOL));
token->force_prefix = 0;
- token->ngram_unit = GRN_TABLE_DEFAULT_NGRAM_UNIT_SIZE;
- token->offset = 0;
- token->len = 0;
token->pctx.user_data.ptr = NULL;
token->pctx.obj = table;
token->pctx.hooks = NULL;
@@ -641,8 +484,9 @@ grn_token_close(grn_token *token)
if (token) {
grn_ctx *ctx = token->ctx;
if (token->nstr) { grn_nstr_close(token->nstr); }
- ((grn_proc *)token->tokenizer)->funcs[PROC_FIN](ctx, token->table, &token->pctx.user_data,
- 0, token->pctx.data);
+ ((grn_proc *)token->tokenizer)->funcs[PROC_FIN](ctx, token->table,
+ &token->pctx.user_data,
+ 0, token->pctx.data);
GRN_FREE(token);
return GRN_SUCCESS;
} else {
@@ -650,14 +494,23 @@ grn_token_close(grn_token *token)
}
}
-#define DB_OBJ(obj) ((grn_db_obj *)obj)
-
grn_rc
grn_db_init_builtin_tokenizers(grn_ctx *ctx) /* registers the built-in tokenizer procs and checks that each one received its expected fixed id */
{
grn_obj *obj;
+ obj = grn_proc_create(ctx, "<token:unigram>", 15, NULL, GRN_PROC_HOOK, /* 15 equals the length of the name string */
+ unigram_init, ngram_next, ngram_fin); /* all three n-gram tokenizers share ngram_next and ngram_fin */
+ if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_UNIGRAM) { return grn_invalid_format; } /* creation failed or the id differs from the enum value */
obj = grn_proc_create(ctx, "<token:bigram>", 14, NULL, GRN_PROC_HOOK,
- bigram_init, bigram_next, bigram_fin);
- if (!obj || DB_OBJ(obj)->id != GRN_DB_BIGRAM) { return grn_invalid_format; }
+ bigram_init, ngram_next, ngram_fin);
+ if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_BIGRAM) { return grn_invalid_format; }
+ obj = grn_proc_create(ctx, "<token:trigram>", 15, NULL, GRN_PROC_HOOK,
+ trigram_init, ngram_next, ngram_fin);
+ if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_TRIGRAM) { return grn_invalid_format; }
+#ifndef NO_MECAB
+ obj = grn_proc_create(ctx, "<token:mecab>", 13, NULL, GRN_PROC_HOOK, /* not registered at all when NO_MECAB is defined */
+ mecab_init, mecab_next, mecab_fin);
+ if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_MECAB) { return grn_invalid_format; }
+#endif /* NO_MECAB */
return GRN_SUCCESS;
}
View
13 lib/token.h
@@ -47,28 +47,15 @@ extern "C" {
typedef struct {
grn_ctx *ctx;
grn_obj *table;
- unsigned char *buf;
const unsigned char *orig;
- const unsigned char *next;
const unsigned char *curr;
uint32_t orig_blen;
uint32_t curr_size;
grn_nstr *nstr;
-#ifndef NO_MECAB
- mecab_t *mecab;
-#endif /* NO_MECAB */
int32_t pos;
- int32_t len;
- uint32_t skip;
- uint32_t tail;
- uint32_t offset;
grn_search_flags flags;
uint8_t status;
- uint8_t uni_alpha;
- uint8_t uni_digit;
- uint8_t uni_symbol;
uint8_t force_prefix;
- uint8_t ngram_unit;
grn_obj_flags table_flags;
grn_encoding encoding;
grn_obj *tokenizer;

0 comments on commit ff8fca2

Please sign in to comment.
Something went wrong with that request. Please try again.