Skip to content

Commit

Permalink
Add unify-kana-hyphen option
Browse files (browse the repository at this point in the history)
  • Loading branch information
HashidaTKS committed Feb 27, 2023
1 parent c6107af commit a1473ea
Show file tree
Hide file tree
Showing 291 changed files with 10,729 additions and 88 deletions.
1 change: 1 addition & 0 deletions lib/grn_nfkc.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ typedef struct {
grn_bool unify_katakana_wo_sound;
grn_bool unify_katakana_di_sound;
grn_bool unify_katakana_gu_small_sounds;
grn_bool unify_kana_hyphen;
grn_bool unify_kana_prolonged_sound_mark;
grn_bool unify_katakana_trailing_o;
grn_bool unify_to_romaji;
Expand Down
7 changes: 7 additions & 0 deletions lib/nfkc.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ grn_nfkc_normalize_options_init(grn_ctx *ctx,
options->unify_katakana_wo_sound = GRN_FALSE;
options->unify_katakana_di_sound = GRN_FALSE;
options->unify_katakana_gu_small_sounds = GRN_FALSE;
options->unify_kana_hyphen = GRN_FALSE;
options->unify_kana_prolonged_sound_mark = GRN_FALSE;
options->unify_katakana_trailing_o = GRN_FALSE;
options->unify_to_romaji = GRN_FALSE;
Expand Down Expand Up @@ -236,6 +237,12 @@ grn_nfkc_normalize_options_apply(grn_ctx *ctx,
raw_options,
i,
options->unify_katakana_gu_small_sounds);
} else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_kana_hyphen")) {
options->unify_kana_hyphen =
grn_vector_get_element_bool(ctx,
raw_options,
i,
options->unify_kana_hyphen);
} else if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "unify_kana_prolonged_sound_mark")) {
options->unify_kana_prolonged_sound_mark =
grn_vector_get_element_bool(ctx,
Expand Down
273 changes: 185 additions & 88 deletions lib/normalizer.c

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
normalize 'NormalizerNFKC100("unify_kana_hyphen", true, "report_source_offset", true)' "あ-ぁ-い-ぃ-う-ぅ-え-ぇ-お-ぉ-" WITH_CHECKS|WITH_TYPES
[
[
0,
0.0,
0.0
],
{
"normalized": "ああぁあいいぃいううぅうええぇえおおぉお",
"types": [
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana"
],
"checks": [
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0
],
"offsets": [
0,
3,
4,
7,
8,
11,
12,
15,
16,
19,
20,
23,
24,
27,
28,
31,
32,
35,
36,
39
]
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
normalize \
'NormalizerNFKC100("unify_kana_hyphen", true, \
"report_source_offset", true)' \
"あ-ぁ-い-ぃ-う-ぅ-え-ぇ-お-ぉ-" \
WITH_CHECKS|WITH_TYPES
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
normalize 'NormalizerNFKC100("unify_kana_hyphen", true, "report_source_offset", true)' "ば-び-ぶ-べ-ぼ-" WITH_CHECKS|WITH_TYPES
[
[
0,
0.0,
0.0
],
{
"normalized": "ばあびいぶうべえぼお",
"types": [
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana"
],
"checks": [
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0
],
"offsets": [
0,
3,
4,
7,
8,
11,
12,
15,
16,
19
]
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
normalize \
'NormalizerNFKC100("unify_kana_hyphen", true, \
"report_source_offset", true)' \
"ば-び-ぶ-べ-ぼ-" \
WITH_CHECKS|WITH_TYPES
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
normalize 'NormalizerNFKC100("unify_kana_hyphen", true, "report_source_offset", true)' "だ-ぢ-づ-で-ど-" WITH_CHECKS|WITH_TYPES
[
[
0,
0.0,
0.0
],
{
"normalized": "だあぢいづうでえどお",
"types": [
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana"
],
"checks": [
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0
],
"offsets": [
0,
3,
4,
7,
8,
11,
12,
15,
16,
19
]
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
normalize \
'NormalizerNFKC100("unify_kana_hyphen", true, \
"report_source_offset", true)' \
"だ-ぢ-づ-で-ど-" \
WITH_CHECKS|WITH_TYPES
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
normalize 'NormalizerNFKC100("unify_kana_hyphen", true, "report_source_offset", true)' "ゔ-ゕ-ゖ-" WITH_CHECKS|WITH_TYPES
[
[
0,
0.0,
0.0
],
{
"normalized": "ゔうゕあゖえ",
"types": [
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana",
"hiragana"
],
"checks": [
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0,
3,
0,
0,
1,
0,
0
],
"offsets": [
0,
3,
4,
7,
8,
11
]
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
normalize \
'NormalizerNFKC100("unify_kana_hyphen", true, \
"report_source_offset", true)' \
"ゔ-ゕ-ゖ-" \
WITH_CHECKS|WITH_TYPES
Loading

0 comments on commit a1473ea

Please sign in to comment.