Navigation Menu

Skip to content

Commit

Permalink
ii regexp: support multiple ".*" in one pattern
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Apr 13, 2017
1 parent a9f2dee commit 9d06ca0
Show file tree
Hide file tree
Showing 9 changed files with 403 additions and 47 deletions.
109 changes: 62 additions & 47 deletions lib/ii.c
Expand Up @@ -7712,9 +7712,11 @@ typedef struct {
const char *string;
unsigned int string_len;
grn_bool done;
grn_ii_select_cursor_posting unshifted_posting;
grn_bool have_unshifted_posting;
} grn_ii_select_cursor;

grn_rc
static grn_rc
grn_ii_select_cursor_close(grn_ctx *ctx,
grn_ii_select_cursor *cursor)
{
Expand All @@ -7738,7 +7740,7 @@ grn_ii_select_cursor_close(grn_ctx *ctx,
return GRN_SUCCESS;
}

grn_ii_select_cursor *
static grn_ii_select_cursor *
grn_ii_select_cursor_open(grn_ctx *ctx,
grn_ii *ii,
const char *string,
Expand Down Expand Up @@ -7845,10 +7847,12 @@ grn_ii_select_cursor_open(grn_ctx *ctx,

cursor->done = GRN_FALSE;

cursor->have_unshifted_posting = GRN_FALSE;

return cursor;
}

grn_ii_select_cursor_posting *
static grn_ii_select_cursor_posting *
grn_ii_select_cursor_next(grn_ctx *ctx,
grn_ii_select_cursor *cursor)
{
Expand All @@ -7859,6 +7863,11 @@ grn_ii_select_cursor_next(grn_ctx *ctx,
int max_interval = cursor->max_interval;
grn_operator mode = cursor->mode;

if (cursor->have_unshifted_posting) {
cursor->have_unshifted_posting = GRN_FALSE;
return &(cursor->unshifted_posting);
}

if (cursor->done) {
return NULL;
}
Expand All @@ -7885,29 +7894,48 @@ grn_ii_select_cursor_next(grn_ctx *ctx,
}

if (tip == tie) {
int n_occurs = 0;
int start_pos = 0;
int pos = 0;
int end_pos = 0;
int score = 0;
int tf = 0;
int tscore = 0;

#define SKIP_OR_BREAK(pos) {\
if (token_info_skip_pos(ctx, ti, rid, sid, pos)) { break; } \
if (token_info_skip_pos(ctx, ti, rid, sid, pos)) { break; } \
if (ti->p->rid != rid || ti->p->sid != sid) { \
next_rid = ti->p->rid; \
next_sid = ti->p->sid; \
break; \
} \
}

#define RETURN_POSTING() do { \
cursor->posting.rid = rid; \
cursor->posting.sid = sid; \
cursor->posting.start_pos = start_pos; \
cursor->posting.end_pos = end_pos; \
cursor->posting.tf = tf; \
cursor->posting.weight = tscore; \
if (token_info_skip_pos(ctx, *tis, rid, sid, pos) != GRN_SUCCESS) { \
if (token_info_skip(ctx, *tis, next_rid, next_sid) != GRN_SUCCESS) { \
cursor->done = GRN_TRUE; \
} \
} \
return &(cursor->posting); \
} while (GRN_FALSE)

if (n_tis == 1) {
n_occurs = (*tis)->p->tf;
start_pos = pos = end_pos = (*tis)->p->pos;
pos++;
tf = (*tis)->p->tf;
tscore = (*tis)->p->weight + (*tis)->cursors->bins[0]->weight;
start_pos = end_pos = (*tis)->p->pos;
RETURN_POSTING();
} else if (mode == GRN_OP_NEAR) {
bt_zap(bt);
for (tip = tis; tip < tie; tip++) {
token_info *ti = *tip;
SKIP_OR_BREAK(end_pos);
SKIP_OR_BREAK(pos);
bt_push(bt, ti);
}
if (tip == tie) {
Expand Down Expand Up @@ -7937,7 +7965,8 @@ grn_ii_select_cursor_next(grn_ctx *ctx,
return NULL;
}
if ((max_interval < 0) || (max - min <= max_interval)) {
n_occurs++;
/* TODO: Set start_pos, pos, end_pos, tf and tscore */
RETURN_POSTING();
if (ti->pos == max + 1) {
break;
}
Expand All @@ -7958,41 +7987,27 @@ grn_ii_select_cursor_next(grn_ctx *ctx,

if (tip == tie) { tip = tis; }
ti = *tip;
SKIP_OR_BREAK(end_pos);
if (ti->pos == end_pos) {
SKIP_OR_BREAK(pos);
if (ti->pos == pos) {
score += ti->p->weight + ti->cursors->bins[0]->weight;
count++;
} else {
score = ti->p->weight + ti->cursors->bins[0]->weight;
count = 1;
if (start_pos == 0) {
start_pos = ti->pos;
}
end_pos = ti->pos;
start_pos = pos = ti->pos;
end_pos = ti->p->pos;
}
if (count == n_tis) {
pos++;
if (ti->p->pos > end_pos) {
end_pos = ti->p->pos;
}
tf = 1;
tscore += score;
score = 0;
count = 0;
end_pos++;
n_occurs++;
RETURN_POSTING();
}
}
}
if (n_occurs > 0) {
cursor->posting.rid = rid;
cursor->posting.sid = sid;
cursor->posting.start_pos = start_pos;
cursor->posting.end_pos = end_pos;
cursor->posting.tf = n_occurs;
cursor->posting.weight = tscore;
if (token_info_skip_pos(ctx, *tis, rid, sid, end_pos + 1) != GRN_SUCCESS) {
if (token_info_skip(ctx, *tis, next_rid, next_sid) != GRN_SUCCESS) {
cursor->done = GRN_TRUE;
}
}
return &(cursor->posting);
}
#undef SKIP_OR_BREAK
}
if (token_info_skip(ctx, *tis, next_rid, next_sid)) {
Expand All @@ -8001,6 +8016,15 @@ grn_ii_select_cursor_next(grn_ctx *ctx,
}
}

/*
 * Pushes one posting back onto `cursor`.
 *
 * The posting is copied by value into the cursor's single push-back slot
 * and the slot is flagged as occupied, so the caller's `posting` pointer
 * does not need to stay valid afterwards.  grn_ii_select_cursor_next()
 * checks that flag first: when it is set, it clears it and returns the
 * stored posting instead of advancing.
 *
 * There is only one slot, so calling this again before the stored posting
 * has been consumed overwrites it.
 *
 * `ctx` is not used here.
 */
static void
grn_ii_select_cursor_unshift(grn_ctx *ctx,
                             grn_ii_select_cursor *cursor,
                             grn_ii_select_cursor_posting *posting)
{
  grn_ii_select_cursor_posting *slot = &(cursor->unshifted_posting);

  *slot = *posting;
  cursor->have_unshifted_posting = GRN_TRUE;
}

static grn_rc
grn_ii_parse_regexp_query(grn_ctx *ctx,
const char *log_tag,
Expand Down Expand Up @@ -8131,8 +8155,6 @@ grn_ii_select_regexp(grn_ctx *ctx, grn_ii *ii,
int i;
grn_ii_select_cursor **cursors;
grn_bool have_error = GRN_FALSE;
int keep_i = 0;
grn_ii_select_cursor_posting keep_posting;

cursors = GRN_CALLOC(sizeof(grn_ii_select_cursor *) * n_parsed_strings);
for (i = 0; i < n_parsed_strings; i++) {
Expand Down Expand Up @@ -8169,26 +8191,19 @@ grn_ii_select_regexp(grn_ctx *ctx, grn_ii *ii,
grn_ii_select_cursor_posting *posting_i;

for (;;) {
if (keep_i == i) {
posting_i = &keep_posting;
keep_i = 0;
} else {
posting_i = grn_ii_select_cursor_next(ctx, cursors[i]);
if (!posting_i) {
break;
}
posting_i = grn_ii_select_cursor_next(ctx, cursors[i]);
if (!posting_i) {
break;
}

if (posting_i->rid == posting->rid &&
posting_i->sid == posting->sid &&
posting_i->start_pos > pos) {
keep_i = i;
keep_posting = *posting_i;
grn_ii_select_cursor_unshift(ctx, cursors[i], posting_i);
break;
}
if (posting_i->rid > posting->rid) {
keep_i = i;
keep_posting = *posting_i;
grn_ii_select_cursor_unshift(ctx, cursors[i], posting_i);
break;
}
}
Expand Down
@@ -0,0 +1,60 @@
table_create Properties TABLE_NO_KEY
[[0,0.0,0.0],true]
column_create Properties content COLUMN_SCALAR ShortText
[[0,0.0,0.0],true]
table_create RegexpTokens TABLE_PAT_KEY ShortText --normalizer NormalizerAuto --default_tokenizer TokenRegexp
[[0,0.0,0.0],true]
column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Properties content
[[0,0.0,0.0],true]
load --table Properties
[
{"content": "app:Groonga"},
{"content": "app:apple"},
{"content": "project:Groonga"},
{"content": "appname:application1"}
]
[[0,0.0,0.0],4]
log_level --level info
[[0,0.0,0.0],true]
select Properties --filter 'content @~ "app.*:.*pp.*"' --output_columns content,_score
[
[
0,
0.0,
0.0
],
[
[
[
2
],
[
[
"content",
"ShortText"
],
[
"_score",
"Int32"
]
],
[
"app:apple",
1
],
[
"appname:application1",
1
]
]
]
]
#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content>
#|i| grn_ii_sel > (app.*:.*pp.*)
#|i| [ii][select][cursor][open] n=2 <app>
#|i| [ii][select][cursor][open] n=1 <:>
#|i| [ii][select][cursor][open] n=1 <pp>
#|i| exact: 2
#|i| hits=2
log_level --level notice
[[0,0.0,0.0],true]
@@ -0,0 +1,26 @@
#$GRN_II_REGEXP_DOT_ASTERISK_ENABLE=yes

table_create Properties TABLE_NO_KEY
column_create Properties content COLUMN_SCALAR ShortText

table_create RegexpTokens TABLE_PAT_KEY ShortText \
--normalizer NormalizerAuto \
--default_tokenizer TokenRegexp
column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
Properties content

load --table Properties
[
{"content": "app:Groonga"},
{"content": "app:apple"},
{"content": "project:Groonga"},
{"content": "appname:application1"}
]

log_level --level info
#@add-important-log-levels info
select Properties \
--filter 'content @~ "app.*:.*pp.*"' \
--output_columns content,_score
#@remove-important-log-levels info
log_level --level notice
@@ -0,0 +1,55 @@
table_create Properties TABLE_NO_KEY
[[0,0.0,0.0],true]
column_create Properties content COLUMN_SCALAR ShortText
[[0,0.0,0.0],true]
table_create RegexpTokens TABLE_PAT_KEY ShortText --normalizer NormalizerAuto --default_tokenizer TokenRegexp
[[0,0.0,0.0],true]
column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION Properties content
[[0,0.0,0.0],true]
load --table Properties
[
{"content": "app:Groonga"},
{"content": "app:apple"},
{"content": "project:app:apple"},
{"content": "appname:application1"}
]
[[0,0.0,0.0],4]
log_level --level info
[[0,0.0,0.0],true]
select Properties --filter 'content @~ "\\\\Aapp:.*pp.*"' --output_columns content,_score
[
[
0,
0.0,
0.0
],
[
[
[
1
],
[
[
"content",
"ShortText"
],
[
"_score",
"Int32"
]
],
[
"app:apple",
1
]
]
]
]
#|i| [object][search][index][key][regexp] <RegexpTokens.memos_content>
#|i| grn_ii_sel > (\Aapp:.*pp.*)
#|i| [ii][select][cursor][open] n=3 <￯app:>
#|i| [ii][select][cursor][open] n=1 <pp>
#|i| exact: 1
#|i| hits=1
log_level --level notice
[[0,0.0,0.0],true]
@@ -0,0 +1,26 @@
#$GRN_II_REGEXP_DOT_ASTERISK_ENABLE=yes

table_create Properties TABLE_NO_KEY
column_create Properties content COLUMN_SCALAR ShortText

table_create RegexpTokens TABLE_PAT_KEY ShortText \
--normalizer NormalizerAuto \
--default_tokenizer TokenRegexp
column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
Properties content

load --table Properties
[
{"content": "app:Groonga"},
{"content": "app:apple"},
{"content": "project:app:apple"},
{"content": "appname:application1"}
]

log_level --level info
#@add-important-log-levels info
select Properties \
--filter 'content @~ "\\\\Aapp:.*pp.*"' \
--output_columns content,_score
#@remove-important-log-levels info
log_level --level notice

0 comments on commit 9d06ca0

Please sign in to comment.