Skip to content

Commit

Permalink
lib-fts: Change normalizer filter to use new truncate.
Browse files Browse the repository at this point in the history
  • Loading branch information
thuovila authored and sirainen committed Aug 22, 2016
1 parent 5fcd30a commit 35eb3a2
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 8 deletions.
11 changes: 3 additions & 8 deletions src/lib-fts/fts-filter-normalizer-icu.c
Expand Up @@ -4,7 +4,7 @@
#include "buffer.h"
#include "str.h"
#include "unichar.h" /* unicode replacement char */
#include "fts-tokenizer-common.h"
#include "fts-filter-common.h"
#include "fts-filter-private.h"
#include "fts-language.h"

Expand All @@ -19,7 +19,6 @@ struct fts_filter_normalizer_icu {
UTransliterator *transliterator;
buffer_t *utf16_token, *trans_token;
string_t *utf8_token;
unsigned int maxlen;
};

static void fts_filter_normalizer_icu_destroy(struct fts_filter *filter)
Expand Down Expand Up @@ -69,7 +68,7 @@ fts_filter_normalizer_icu_create(const struct fts_language *lang ATTR_UNUSED,
np->utf16_token = buffer_create_dynamic(pp, 128);
np->trans_token = buffer_create_dynamic(pp, 128);
np->utf8_token = buffer_create_dynamic(pp, 128);
np->maxlen = max_length;
np->filter.max_length = max_length;
*filter_r = &np->filter;
return 0;
}
Expand Down Expand Up @@ -101,11 +100,7 @@ fts_filter_normalizer_icu_filter(struct fts_filter *filter, const char **token,

fts_icu_utf16_to_utf8(np->utf8_token, np->trans_token->data,
np->trans_token->used / sizeof(UChar));
if (str_len(np->utf8_token) > np->maxlen) {
size_t len = np->maxlen;
fts_tokenizer_delete_trailing_partial_char(np->utf8_token->data, &len);
str_truncate(np->utf8_token, len);
}
fts_filter_truncate_token(np->utf8_token, np->filter.max_length);
*token = str_c(np->utf8_token);
return 1;
}
Expand Down
20 changes: 20 additions & 0 deletions src/lib-fts/test-fts-filter.c
Expand Up @@ -728,6 +728,25 @@ static void test_fts_filter_normalizer_oversized(void)
test_end();
}

/*
 * Runs an 11-byte token (nine ASCII letters followed by the two bytes
 * 0xC3 0x85) through an ICU normalizer filter created with the
 * "Any-Lower;" id and "maxlen" set to "10", and expects only the nine
 * ASCII letters to come back.
 *
 * NOTE(review): presumably the 10-byte limit falls inside the trailing
 * two-byte UTF-8 character, so the filter has to drop that character
 * completely rather than emit its first byte alone - confirm against the
 * filter's truncation helper.
 */
static void test_fts_filter_normalizer_truncation(void)
{
	/* In/out parameter: fts_filter_filter() is handed its address. */
	const char *token = "abcdefghi\xC3\x85";
	const char *error = NULL;
	const char *settings[] = {
		"id", "Any-Lower;",
		"maxlen", "10",
		NULL
	};
	struct fts_filter *norm = NULL;

	test_begin("fts filter normalizer token truncated mid letter");

	/* Creating the filter must succeed without reporting an error. */
	test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL,
				      settings, &norm, &error) == 0);
	test_assert(error == NULL);

	/* Filtering must not fail, and the resulting token must consist of
	   exactly the nine ASCII letters. */
	test_assert(fts_filter_filter(norm, &token, &error) >= 0);
	test_assert(strcmp(token, "abcdefghi") == 0);

	fts_filter_unref(&norm);
	test_end();
}

#ifdef HAVE_FTS_STEMMER
static void test_fts_filter_normalizer_stopwords_stemmer_eng(void)
{
Expand Down Expand Up @@ -986,6 +1005,7 @@ int main(void)
test_fts_filter_normalizer_baddata,
test_fts_filter_normalizer_invalid_id,
test_fts_filter_normalizer_oversized,
test_fts_filter_normalizer_truncation,
#ifdef HAVE_FTS_STEMMER
test_fts_filter_normalizer_stopwords_stemmer_eng,
test_fts_filter_stopwords_normalizer_stemmer_no,
Expand Down

0 comments on commit 35eb3a2

Please sign in to comment.