Navigation Menu

Skip to content

Commit

Permalink
TokenNgram: fix wrong first character length
Browse files Browse the repository at this point in the history
The problem occurs with PARENTHESIZED IDEOGRAPH characters such as
U+3231 PARENTHESIZED IDEOGRAPH STOCK, which normalize to multiple characters.
  • Loading branch information
kou committed Jun 26, 2018
1 parent 733e399 commit 49cae7d
Show file tree
Hide file tree
Showing 5 changed files with 218 additions and 1 deletion.
4 changes: 3 additions & 1 deletion lib/tokenizers.c
Expand Up @@ -416,7 +416,9 @@ ngram_switch_to_loose_mode(grn_ctx *ctx,
removed_checks = NULL;
}
for (i = 0; i < length; i++) {
loose_checks[i] += checks[i];
if (checks[i] != -1) {
loose_checks[i] += checks[i];
}
}
loose_checks += length;
}
Expand Down
@@ -0,0 +1,37 @@
table_create Entries TABLE_NO_KEY
[[0,0.0,0.0],true]
column_create Entries body COLUMN_SCALAR ShortText
[[0,0.0,0.0],true]
table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer 'TokenNgram("loose_symbol", true, "report_source_location", true)' --normalizer 'NormalizerNFKC100'
[[0,0.0,0.0],true]
column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
[[0,0.0,0.0],true]
load --table Entries
[
{"body": "ここは㈱グルンガ"}
]
[[0,0.0,0.0],1]
select Entries --match_columns body --query '株グル' --output_columns 'highlight_html(body, Terms)'
[
[
0,
0.0,
0.0
],
[
[
[
1
],
[
[
"highlight_html",
null
]
],
[
"ここは<span class=\"keyword\">㈱グル</span>ンガ"
]
]
]
]
@@ -0,0 +1,19 @@
# Regression test: highlighting text that contains U+3231 (PARENTHESIZED
# IDEOGRAPH STOCK) when the query matches the ideograph without parentheses.
table_create Entries TABLE_NO_KEY
column_create Entries body COLUMN_SCALAR ShortText

# Lexicon using TokenNgram with "loose_symbol" and "report_source_location"
# enabled and NFKC100 normalization; it is passed to highlight_html below.
table_create Terms TABLE_PAT_KEY ShortText \
--default_tokenizer 'TokenNgram("loose_symbol", true, \
"report_source_location", true)' \
--normalizer 'NormalizerNFKC100'
column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body

# Single record whose body contains the U+3231 character.
load --table Entries
[
{"body": "ここは㈱グルンガ"}
]

# The query uses the plain ideograph (no parentheses); the expected output
# wraps the original U+3231 character and the following "グル" in one
# <span class="keyword"> element.
select Entries \
--match_columns body \
--query '株グル' \
--output_columns 'highlight_html(body, Terms)'

@@ -0,0 +1,154 @@
tokenize 'TokenNgram("loose_symbol", true, "report_source_location", true)' "ここは㈱グルンガ" 'NormalizerNFKC100("report_source_offset", true)'
[
[
0,
0.0,
0.0
],
[
{
"value": "ここ",
"position": 0,
"force_prefix": false,
"source_offset": 0,
"source_length": 6,
"source_first_character_length": 3
},
{
"value": "こは",
"position": 1,
"force_prefix": false,
"source_offset": 3,
"source_length": 6,
"source_first_character_length": 3
},
{
"value": "は",
"position": 2,
"force_prefix": false,
"source_offset": 6,
"source_length": 3,
"source_first_character_length": 3
},
{
"value": "(",
"position": 3,
"force_prefix": false,
"source_offset": 9,
"source_length": 0,
"source_first_character_length": 3
},
{
"value": "株",
"position": 4,
"force_prefix": false,
"source_offset": 9,
"source_length": 0,
"source_first_character_length": 3
},
{
"value": ")",
"position": 5,
"force_prefix": false,
"source_offset": 9,
"source_length": 3,
"source_first_character_length": 3
},
{
"value": "グル",
"position": 6,
"force_prefix": false,
"source_offset": 12,
"source_length": 6,
"source_first_character_length": 3
},
{
"value": "ルン",
"position": 7,
"force_prefix": false,
"source_offset": 15,
"source_length": 6,
"source_first_character_length": 3
},
{
"value": "ンガ",
"position": 8,
"force_prefix": false,
"source_offset": 18,
"source_length": 6,
"source_first_character_length": 3
},
{
"value": "￰",
"position": 9,
"force_prefix": false,
"source_offset": 24,
"source_length": 0,
"source_first_character_length": 0
},
{
"value": "ここ",
"position": 10,
"force_prefix": false,
"source_offset": 0,
"source_length": 6,
"source_first_character_length": 3
},
{
"value": "こは",
"position": 11,
"force_prefix": false,
"source_offset": 3,
"source_length": 6,
"source_first_character_length": 3
},
{
"value": "は株",
"position": 12,
"force_prefix": false,
"source_offset": 6,
"source_length": 6,
"source_first_character_length": 3
},
{
"value": "株グ",
"position": 13,
"force_prefix": false,
"source_offset": 9,
"source_length": 6,
"source_first_character_length": 3
},
{
"value": "グル",
"position": 14,
"force_prefix": false,
"source_offset": 12,
"source_length": 6,
"source_first_character_length": 3
},
{
"value": "ルン",
"position": 15,
"force_prefix": false,
"source_offset": 15,
"source_length": 6,
"source_first_character_length": 3
},
{
"value": "ンガ",
"position": 16,
"force_prefix": false,
"source_offset": 18,
"source_length": 6,
"source_first_character_length": 3
},
{
"value": "ガ",
"position": 17,
"force_prefix": false,
"source_offset": 21,
"source_length": 3,
"source_first_character_length": 3
}
]
]
@@ -0,0 +1,5 @@
# Regression test: tokenize text containing U+3231 (PARENTHESIZED IDEOGRAPH
# STOCK) with "loose_symbol" and "report_source_location" enabled on
# TokenNgram and "report_source_offset" enabled on NormalizerNFKC100.
# The expected output checks source_offset, source_length and
# source_first_character_length for every token.
tokenize \
'TokenNgram("loose_symbol", true, \
"report_source_location", true)' \
"ここは㈱グルンガ" \
'NormalizerNFKC100("report_source_offset", true)'

0 comments on commit 49cae7d

Please sign in to comment.