Skip to content

Commit

Permalink
lib-fts: tokenizer-generic - simple explicit prefix search logic
Browse files Browse the repository at this point in the history
The logic is that a word followed by a '*' creates a prefix search token.
A new token begins immediately after that, so "foo*bar" yields 2 tokens,
"foo*" and "bar", when in explicit prefix search tokenisation mode.

Only active in 'simple', not 'tr29'.

Signed-off-by: Phil Carmody <phil@dovecot.fi>
  • Loading branch information
Phil Carmody authored and cmouse committed Oct 10, 2018
1 parent c59424d commit e6a1742
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 11 deletions.
3 changes: 3 additions & 0 deletions src/lib-fts/fts-common.h
Expand Up @@ -8,6 +8,9 @@
((c) == 0x0027 || IS_NONASCII_APOSTROPHE(c))
/* True when c is U+0027 (ASCII apostrophe) or U+2019 (right single
   quotation mark). NOTE(review): the name presumably refers to the
   TR29 word-break rule WB5a - confirm against the tokenizer code. */
#define IS_WB5A_APOSTROPHE(c) \
((c) == 0x0027 || (c) == 0x2019)
/* Code point of the character that marks an explicit prefix search
   when it follows a word, e.g. "foo*". */
#define FTS_PREFIX_SPLAT_CHAR 0x002A /* '*' */
/* True when the code point c is the prefix search marker. */
#define IS_PREFIX_SPLAT(c) \
((c) == FTS_PREFIX_SPLAT_CHAR)
/* The h letters are included because it is an exception in French.
A, E, H, I, O, U, Y, a, e, h, i, o, u, y */
#define IS_ASCII_VOWEL(c) \
Expand Down
1 change: 1 addition & 0 deletions src/lib-fts/fts-tokenizer-generic-private.h
Expand Up @@ -26,6 +26,7 @@ enum letter_type {
LETTER_TYPE_SOT,
LETTER_TYPE_EOT,
LETTER_TYPE_APOSTROPHE, /* Own modification to TR29 */
LETTER_TYPE_PREFIXSPLAT, /* Dovecot '*' for glob-like explicit prefix searching */
LETTER_TYPE_OTHER /* WB14 "any" */
};

Expand Down
31 changes: 20 additions & 11 deletions src/lib-fts/fts-tokenizer-generic.c
Expand Up @@ -152,6 +152,10 @@ fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
len--;
i_assert(len > 0 && data[len-1] != '\'');
}
if (len > 0 && data[len-1] == '*' && !tok->prefixsplat) {
len--;
i_assert(len > 0 && data[len-1] != '*');
}
} else {
fts_tokenizer_delete_trailing_partial_char(data, &len);
}
Expand All @@ -161,7 +165,6 @@ fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
t_strndup(tok->token->data, len);
buffer_set_used_size(tok->token, 0);
tok->untruncated_length = 0;
shift_prev_type(tok, LETTER_TYPE_NONE);
return len > 0;
}

Expand Down Expand Up @@ -260,19 +263,23 @@ fts_tokenizer_generic_simple_next(struct fts_tokenizer *_tok,
i_assert(char_size > 0);

apostrophe = IS_APOSTROPHE(c);
break_type = fts_simple_is_word_break(tok, c, apostrophe);
if (break_type != FTS_WORD_TO_WORD && break_type != FTS_STOP_TO_WORD) {
if ((tok->prefixsplat && IS_PREFIX_SPLAT(c)) &&
(tok->prev_type == LETTER_TYPE_ALETTER)) {
/* this might be a prefix-matching query */
shift_prev_type(tok, LETTER_TYPE_PREFIXSPLAT);
} else if ((break_type = fts_simple_is_word_break(tok, c, apostrophe))
!= FTS_WORD_TO_WORD) {
tok_append_truncated(tok, data + start, i - start);
shift_prev_type(tok, (break_type & FTS_TO_WORD) != 0
? LETTER_TYPE_ALETTER : LETTER_TYPE_NONE);
if (fts_tokenizer_generic_simple_current_token(tok, token_r)) {
*skip_r = i + char_size;
*skip_r = i;
if (break_type != FTS_STOP_TO_WORD) /* therefore *_TO_STOP */
*skip_r += char_size;
return 1;
}
start = i + char_size;
/* it doesn't actually matter at this point whether
subsequent apostrophes are handled by prefix
skipping or by ignoring empty tokens - they will be
dropped in any case. */
shift_prev_type(tok, LETTER_TYPE_NONE);
if ((break_type & FTS_TO_WORD) == 0)
start = i + char_size;
} else if (apostrophe) {
/* all apostrophes require special handling */
const unsigned char apostrophe_char = '\'';
Expand All @@ -295,6 +302,7 @@ fts_tokenizer_generic_simple_next(struct fts_tokenizer *_tok,

/* return the last token */
if (size == 0) {
shift_prev_type(tok, LETTER_TYPE_NONE);
if (fts_tokenizer_generic_simple_current_token(tok, token_r))
return 1;
}
Expand Down Expand Up @@ -645,7 +653,8 @@ static struct letter_fn letter_fns[] = {
{letter_single_quote}, {letter_double_quote},
{letter_midnumlet}, {letter_midletter}, {letter_midnum},
{letter_numeric}, {letter_extendnumlet}, {letter_panic},
{letter_panic}, {letter_apostrophe}, {letter_other}
{letter_panic}, {letter_apostrophe}, {letter_panic},
{letter_other}
};

/*
Expand Down

0 comments on commit e6a1742

Please sign in to comment.