TokenRegexp: support escape
kou committed Mar 17, 2015
1 parent 978ef6d commit 3036b42
Showing 7 changed files with 240 additions and 10 deletions.
39 changes: 29 additions & 10 deletions lib/tokenizers.c
@@ -481,6 +481,7 @@ typedef struct {
   grn_bool is_overlapping;
   const char *next;
   const char *end;
+  grn_obj buffer;
 } grn_regexp_tokenizer;
 
 static grn_obj *
@@ -555,6 +556,8 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     }
   }
 
+  GRN_TEXT_INIT(&(tokenizer->buffer), 0);
+
   return NULL;
 }
 
@@ -566,10 +569,13 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   grn_regexp_tokenizer *tokenizer = user_data->ptr;
   unsigned int n_characters = 0;
   int ngram_unit = 2;
-  const char *start = tokenizer->next;
-  const char *current = start;
+  grn_obj *buffer = &(tokenizer->buffer);
+  const char *current = tokenizer->next;
   const char *end = tokenizer->end;
   grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
+  grn_bool escaping = GRN_FALSE;
 
+  GRN_BULK_REWIND(buffer);
+
   if (mode == GRN_TOKEN_GET) {
     if (tokenizer->get.have_begin) {
@@ -620,17 +626,29 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     return NULL;
   }
 
-  n_characters++;
-  current += char_len;
-  tokenizer->next = current;
-  while (n_characters < ngram_unit) {
+  while (GRN_TRUE) {
+    if (!escaping && mode == GRN_TOKEN_GET &&
+        char_len == 1 && current[0] == '\\') {
+      current += char_len;
+      escaping = GRN_TRUE;
+    } else {
+      n_characters++;
+      GRN_TEXT_PUT(ctx, buffer, current, char_len);
+      current += char_len;
+      escaping = GRN_FALSE;
+      if (n_characters == 1) {
+        tokenizer->next = current;
+      }
+      if (n_characters == ngram_unit) {
+        break;
+      }
+    }
+
     char_len = grn_charlen_(ctx, (const char *)current, (const char *)end,
                             tokenizer->query->encoding);
     if (char_len == 0) {
       break;
     }
-    n_characters++;
-    current += char_len;
   }
 
   if (tokenizer->is_overlapping) {
@@ -654,8 +672,8 @@
 
   grn_tokenizer_token_push(ctx,
                            &(tokenizer->token),
-                           (const char *)start,
-                           current - start,
+                           GRN_TEXT_VALUE(buffer),
+                           GRN_TEXT_LEN(buffer),
                            status);
   return NULL;
 }
@@ -669,6 +687,7 @@ regexp_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   }
   grn_tokenizer_token_fin(ctx, &(tokenizer->token));
   grn_tokenizer_query_close(ctx, tokenizer->query);
+  GRN_OBJ_FIN(ctx, &(tokenizer->buffer));
   GRN_FREE(tokenizer);
   return NULL;
 }
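The heart of the change is the new GET-mode loop above: an unescaped backslash in the query marks the following character as literal, so the backslash itself is skipped and every emitted character is copied into tokenizer->buffer. The token can no longer be a contiguous slice of the input, which is why grn_tokenizer_token_push now uses GRN_TEXT_VALUE/GRN_TEXT_LEN instead of start/length pointers. Below is a minimal standalone sketch of the same idea; the next_token helper is hypothetical, handles ASCII only, and omits the multi-byte handling that the real tokenizer does via grn_charlen_.

/* escape_sketch.c: hypothetical, ASCII-only illustration of the
 * escape handling added in this commit. A backslash makes the next
 * character literal, so token text is accumulated in a buffer rather
 * than sliced directly out of the query string. */
#include <stdio.h>

/* Copy up to `unit` unescaped characters from `src` into `dst` and
 * NUL-terminate; a backslash is consumed but never emitted itself. */
static void
next_token(const char *src, char *dst, unsigned int unit)
{
  unsigned int n_characters = 0;
  int escaping = 0;

  while (*src != '\0' && n_characters < unit) {
    if (!escaping && *src == '\\') {
      escaping = 1;                /* skip the escape character */
    } else {
      dst[n_characters++] = *src;  /* emit escaped or plain character */
      escaping = 0;
    }
    src++;
  }
  dst[n_characters] = '\0';
}

int
main(void)
{
  char token[3];
  next_token("\\[e", token, 2);
  printf("%s\n", token);  /* prints "[e": "\[" became a literal "[" */
  return 0;
}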
54 changes: 54 additions & 0 deletions test/command/suite/select/filter/index/regexp/escape.expected
@@ -0,0 +1,54 @@
table_create Logs TABLE_NO_KEY
[[0,0.0,0.0],true]
column_create Logs message COLUMN_SCALAR Text
[[0,0.0,0.0],true]
table_create RegexpLexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp --normalizer NormalizerAuto
[[0,0.0,0.0],true]
column_create RegexpLexicon logs_message_index COLUMN_INDEX|WITH_POSITION Logs message
[[0,0.0,0.0],true]
load --table Logs
[
{"message": "host1:[error]: No memory"},
{"message": "host1:[warning]: Remained disk space is less than 30%"},
{"message": "host1:[error]: Disk full"},
{"message": "host2:[error]: No memory"},
{"message": "host2:[info]: Shutdown"}
]
[[0,0.0,0.0],5]
select Logs --filter 'message @~ "\\\\[error\\\\]"'
[
[
0,
0.0,
0.0
],
[
[
[
3
],
[
[
"_id",
"UInt32"
],
[
"message",
"Text"
]
],
[
1,
"host1:[error]: No memory"
],
[
3,
"host1:[error]: Disk full"
],
[
4,
"host2:[error]: No memory"
]
]
]
]
19 changes: 19 additions & 0 deletions test/command/suite/select/filter/index/regexp/escape.test
@@ -0,0 +1,19 @@
table_create Logs TABLE_NO_KEY
column_create Logs message COLUMN_SCALAR Text

table_create RegexpLexicon TABLE_PAT_KEY ShortText \
--default_tokenizer TokenRegexp \
--normalizer NormalizerAuto
column_create RegexpLexicon logs_message_index \
COLUMN_INDEX|WITH_POSITION Logs message

load --table Logs
[
{"message": "host1:[error]: No memory"},
{"message": "host1:[warning]: Remained disk space is less than 30%"},
{"message": "host1:[error]: Disk full"},
{"message": "host2:[error]: No memory"},
{"message": "host2:[info]: Shutdown"}
]

select Logs --filter 'message @~ "\\\\[error\\\\]"'
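Note the quadruple backslashes in the --filter above: the value is unescaped twice before it reaches the tokenizer, once by the command parser and once as an expression string literal, so \\\\[ becomes \\[ and then \[. The tokenizer therefore sees \[error\], and with the new escape support it looks up the literal text [error] rather than treating [ and ] as regular expression metacharacters.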
30 changes: 30 additions & 0 deletions test/command/suite/tokenizers/regexp/get/escape/one.expected
@@ -0,0 +1,30 @@
table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp
[[0,0.0,0.0],true]
table_tokenize Lexicon "[e" --mode ADD
[
[
0,
0.0,
0.0
],
[
{
"value": "￯",
"position": 0
},
{
"value": "[e",
"position": 1
},
{
"value": "e",
"position": 2
},
{
"value": "￰",
"position": 3
}
]
]
table_tokenize Lexicon "\\[e" --mode GET
[[0,0.0,0.0],[{"value":"[e","position":0}]]
5 changes: 5 additions & 0 deletions test/command/suite/tokenizers/regexp/get/escape/one.test
@@ -0,0 +1,5 @@
table_create Lexicon TABLE_PAT_KEY ShortText \
--default_tokenizer TokenRegexp
table_tokenize Lexicon "[e" --mode ADD

table_tokenize Lexicon "\\[e" --mode GET
98 changes: 98 additions & 0 deletions test/command/suite/tokenizers/regexp/get/escape/two.expected
@@ -0,0 +1,98 @@
table_create Lexicon TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp
[[0,0.0,0.0],true]
table_tokenize Lexicon "c:\\server" --mode ADD
[
[
0,
0.0,
0.0
],
[
{
"value": "￯",
"position": 0
},
{
"value": "c:",
"position": 1
},
{
"value": ":\\",
"position": 2
},
{
"value": "\\s",
"position": 3
},
{
"value": "se",
"position": 4
},
{
"value": "er",
"position": 5
},
{
"value": "rv",
"position": 6
},
{
"value": "ve",
"position": 7
},
{
"value": "er",
"position": 8
},
{
"value": "r",
"position": 9
},
{
"value": "￰",
"position": 10
}
]
]
table_tokenize Lexicon "c:\\\\server" --mode GET
[
[
0,
0.0,
0.0
],
[
{
"value": "c:",
"position": 0
},
{
"value": ":\\",
"position": 1
},
{
"value": "\\s",
"position": 2
},
{
"value": "se",
"position": 3
},
{
"value": "er",
"position": 4
},
{
"value": "rv",
"position": 5
},
{
"value": "ve",
"position": 6
},
{
"value": "er",
"position": 7
}
]
]
5 changes: 5 additions & 0 deletions test/command/suite/tokenizers/regexp/get/escape/two.test
@@ -0,0 +1,5 @@
table_create Lexicon TABLE_PAT_KEY ShortText \
--default_tokenizer TokenRegexp
table_tokenize Lexicon "c:\\server" --mode ADD

table_tokenize Lexicon "c:\\\\server" --mode GET
