Skip to content

Commit

Permalink
Support types by NFKC
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Feb 1, 2013
1 parent 56e9a40 commit f397268
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 2 deletions.
25 changes: 23 additions & 2 deletions normalizers/mysql.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@

#include <groonga/normalizer.h>
#include <groonga/tokenizer.h>
#include <groonga/nfkc.h>

#include <stdint.h>

Expand Down Expand Up @@ -1614,9 +1615,12 @@ normalize(grn_ctx *ctx, grn_obj *string)
{
const char *original, *rest;
unsigned int original_length_in_bytes, rest_length;
unsigned int initial_data_size;
char *normalized;
unsigned int normalized_length_in_bytes = 0;
unsigned int normalized_n_characters = 0;
unsigned char *types = NULL;
unsigned char *current_type = NULL;
grn_encoding encoding;
int flags;
grn_bool remove_blank_p;
Expand All @@ -1625,7 +1629,13 @@ normalize(grn_ctx *ctx, grn_obj *string)
flags = grn_string_get_flags(ctx, string);
remove_blank_p = flags & GRN_STRING_REMOVE_BLANK;
grn_string_get_original(ctx, string, &original, &original_length_in_bytes);
normalized = GRN_PLUGIN_MALLOC(ctx, original_length_in_bytes + 1);
/* Why 3? It is derived from utf8_normalize in groonga/lib/normalizer.c. */
initial_data_size = original_length_in_bytes * 3;
normalized = GRN_PLUGIN_MALLOC(ctx, initial_data_size + 1);
if (flags & GRN_STRING_WITH_TYPES) {
types = GRN_PLUGIN_MALLOC(ctx, initial_data_size + 1);
current_type = types;
}
rest = original;
rest_length = original_length_in_bytes;
while (rest_length > 0) {
Expand All @@ -1640,7 +1650,9 @@ normalize(grn_ctx *ctx, grn_obj *string)

decompose_character(rest, character_length, &plane, &low_code);
if (remove_blank_p && character_length == 1 && rest[0] == ' ') {
/* TODO: set GRN_CHAR_BLANK */
if (current_type > types) {
current_type[-1] |= GRN_CHAR_BLANK;
}
} else {
if (plane >= 0x00 && mysql_unicode_normalize_table[plane]) {
uint32_t normalized_code;
Expand All @@ -1656,6 +1668,14 @@ normalize(grn_ctx *ctx, grn_obj *string)
}
normalized_length_in_bytes += character_length;
}
if (current_type) {
char *current_normalized;
current_normalized =
normalized + normalized_length_in_bytes - character_length;
current_type[0] =
grn_nfkc_char_type((unsigned char *)current_normalized);
current_type++;
}
normalized_n_characters++;
}

Expand All @@ -1669,6 +1689,7 @@ normalize(grn_ctx *ctx, grn_obj *string)
normalized,
normalized_length_in_bytes,
normalized_n_characters);
grn_string_set_types(ctx, string, types);
} else {
/* TODO: report error */
GRN_PLUGIN_FREE(ctx, normalized);
Expand Down
23 changes: 23 additions & 0 deletions test/suite/with_types.expected
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
register normalizers/mysql
[[0,0.0,0.0],true]
normalize NormalizerMySQLGeneralCI "a1!あア亜💕 " WITH_TYPES
[
[
0,
0.0,
0.0
],
{
"normalized": "A1!あア亜💕 ",
"types": [
"alpha",
"digit",
"symbol",
"hiragana",
"katakana",
"kanji",
"others",
"others"
]
}
]
3 changes: 3 additions & 0 deletions test/suite/with_types.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
register normalizers/mysql

normalize NormalizerMySQLGeneralCI "a1!あア亜💕 " WITH_TYPES

0 comments on commit f397268

Please sign in to comment.