Navigation Menu

Skip to content

Commit

Permalink
TokenRegexp: don't search overlapped tokens
Browse files Browse the repository at this point in the history
They are needless: in GET (search) mode, overlapped N-gram tokens add no extra
match information, so all but every ngram_unit-th token can be skipped. This
reduces index lookups without changing search results.
  • Loading branch information
kou committed Apr 7, 2015
1 parent 81d6ec7 commit 6d994a6
Show file tree
Hide file tree
Showing 6 changed files with 180 additions and 4 deletions.
9 changes: 9 additions & 0 deletions lib/tokenizers.c
Expand Up @@ -475,6 +475,7 @@ typedef struct {
struct {
grn_bool have_begin;
grn_bool have_end;
int32_t n_skip_tokens;
} get;
grn_bool is_begin;
grn_bool is_end;
Expand Down Expand Up @@ -513,6 +514,7 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)

tokenizer->get.have_begin = GRN_FALSE;
tokenizer->get.have_end = GRN_FALSE;
tokenizer->get.n_skip_tokens = 0;

tokenizer->is_begin = GRN_TRUE;
tokenizer->is_end = GRN_FALSE;
Expand Down Expand Up @@ -681,6 +683,13 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
status |= GRN_TOKEN_FORCE_PREFIX;
}
}
} else {
if (tokenizer->get.n_skip_tokens > 0) {
tokenizer->get.n_skip_tokens--;
status |= GRN_TOKEN_SKIP;
} else {
tokenizer->get.n_skip_tokens = ngram_unit - 1;
}
}
} else {
if (tokenizer->next == end) {
Expand Down
52 changes: 52 additions & 0 deletions test/command/suite/select/filter/index/regexp/long.expected
@@ -0,0 +1,52 @@
table_create Memos TABLE_NO_KEY
[[0,0.0,0.0],true]
column_create Memos content COLUMN_SCALAR Text
[[0,0.0,0.0],true]
table_create RegexpTokens TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp
[[0,0.0,0.0],true]
column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Memos content
[[0,0.0,0.0],true]
load --table Memos
[
{"content": "Groonga"},
{"content": "Mroonga"},
{"content": "Rroonga and Ruby"}
]
[[0,0.0,0.0],3]
select Memos --filter 'content @~ "roonga"'
[
[
0,
0.0,
0.0
],
[
[
[
3
],
[
[
"_id",
"UInt32"
],
[
"content",
"Text"
]
],
[
1,
"Groonga"
],
[
2,
"Mroonga"
],
[
3,
"Rroonga and Ruby"
]
]
]
]
16 changes: 16 additions & 0 deletions test/command/suite/select/filter/index/regexp/long.test
@@ -0,0 +1,16 @@
table_create Memos TABLE_NO_KEY
column_create Memos content COLUMN_SCALAR Text

table_create RegexpTokens TABLE_PAT_KEY ShortText \
--default_tokenizer TokenRegexp
column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
Memos content

load --table Memos
[
{"content": "Groonga"},
{"content": "Mroonga"},
{"content": "Rroonga and Ruby"}
]

select Memos --filter 'content @~ "roonga"'
4 changes: 0 additions & 4 deletions test/command/suite/tokenizers/regexp/get/end/four.expected
Expand Up @@ -46,10 +46,6 @@ table_tokenize Lexicon "abcd\\z" --mode GET
"value": "ab",
"position": 0
},
{
"value": "bc",
"position": 1
},
{
"value": "cd",
"position": 2
Expand Down
98 changes: 98 additions & 0 deletions test/command/suite/tokenizers/regexp/get/long.expected
@@ -0,0 +1,98 @@
table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp
[[0,0.0,0.0],true]
table_tokenize Lexicon "abcdefghijk" --mode ADD
[
[
0,
0.0,
0.0
],
[
{
"value": "￯",
"position": 0
},
{
"value": "ab",
"position": 1
},
{
"value": "bc",
"position": 2
},
{
"value": "cd",
"position": 3
},
{
"value": "de",
"position": 4
},
{
"value": "ef",
"position": 5
},
{
"value": "fg",
"position": 6
},
{
"value": "gh",
"position": 7
},
{
"value": "hi",
"position": 8
},
{
"value": "ij",
"position": 9
},
{
"value": "jk",
"position": 10
},
{
"value": "k",
"position": 11
},
{
"value": "￰",
"position": 12
}
]
]
table_tokenize Lexicon "abcdefghijk" --mode GET
[
[
0,
0.0,
0.0
],
[
{
"value": "ab",
"position": 0
},
{
"value": "cd",
"position": 2
},
{
"value": "ef",
"position": 4
},
{
"value": "gh",
"position": 6
},
{
"value": "ij",
"position": 8
},
{
"value": "jk",
"position": 9
}
]
]
5 changes: 5 additions & 0 deletions test/command/suite/tokenizers/regexp/get/long.test
@@ -0,0 +1,5 @@
table_create Lexicon TABLE_PAT_KEY ShortText \
--default_tokenizer TokenRegexp
table_tokenize Lexicon "abcdefghijk" --mode ADD

table_tokenize Lexicon "abcdefghijk" --mode GET

0 comments on commit 6d994a6

Please sign in to comment.