Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
TokenNgram: fix wrong first character length
The bug is triggered by PARENTHESIZED IDEOGRAPH characters such as U+3231 PARENTHESIZED IDEOGRAPH STOCK (㈱), which the normalizer expands into several characters ("(", "株", ")").
- Loading branch information
Showing
5 changed files
with
218 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
37 changes: 37 additions & 0 deletions
37
test/command/suite/select/function/highlight_html/lexicon/loose_symbol_kabu.expected
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| table_create Entries TABLE_NO_KEY | ||
| [[0,0.0,0.0],true] | ||
| column_create Entries body COLUMN_SCALAR ShortText | ||
| [[0,0.0,0.0],true] | ||
| table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer 'TokenNgram("loose_symbol", true, "report_source_location", true)' --normalizer 'NormalizerNFKC100' | ||
| [[0,0.0,0.0],true] | ||
| column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body | ||
| [[0,0.0,0.0],true] | ||
| load --table Entries | ||
| [ | ||
| {"body": "ここは㈱グルンガ"} | ||
| ] | ||
| [[0,0.0,0.0],1] | ||
| select Entries --match_columns body --query '株グル' --output_columns 'highlight_html(body, Terms)' | ||
| [ | ||
| [ | ||
| 0, | ||
| 0.0, | ||
| 0.0 | ||
| ], | ||
| [ | ||
| [ | ||
| [ | ||
| 1 | ||
| ], | ||
| [ | ||
| [ | ||
| "highlight_html", | ||
| null | ||
| ] | ||
| ], | ||
| [ | ||
| "ここは<span class=\"keyword\">㈱グル</span>ンガ" | ||
| ] | ||
| ] | ||
| ] | ||
| ] |
19 changes: 19 additions & 0 deletions
19
test/command/suite/select/function/highlight_html/lexicon/loose_symbol_kabu.test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| table_create Entries TABLE_NO_KEY | ||
| column_create Entries body COLUMN_SCALAR ShortText | ||
|
|
||
| table_create Terms TABLE_PAT_KEY ShortText \ | ||
| --default_tokenizer 'TokenNgram("loose_symbol", true, \ | ||
| "report_source_location", true)' \ | ||
| --normalizer 'NormalizerNFKC100' | ||
| column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body | ||
|
|
||
| load --table Entries | ||
| [ | ||
| {"body": "ここは㈱グルンガ"} | ||
| ] | ||
|
|
||
| select Entries \ | ||
| --match_columns body \ | ||
| --query '株グル' \ | ||
| --output_columns 'highlight_html(body, Terms)' | ||
|
|
154 changes: 154 additions & 0 deletions
154
test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_kabu.expected
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,154 @@ | ||
| tokenize 'TokenNgram("loose_symbol", true, "report_source_location", true)' "ここは㈱グルンガ" 'NormalizerNFKC100("report_source_offset", true)' | ||
| [ | ||
| [ | ||
| 0, | ||
| 0.0, | ||
| 0.0 | ||
| ], | ||
| [ | ||
| { | ||
| "value": "ここ", | ||
| "position": 0, | ||
| "force_prefix": false, | ||
| "source_offset": 0, | ||
| "source_length": 6, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": "こは", | ||
| "position": 1, | ||
| "force_prefix": false, | ||
| "source_offset": 3, | ||
| "source_length": 6, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": "は", | ||
| "position": 2, | ||
| "force_prefix": false, | ||
| "source_offset": 6, | ||
| "source_length": 3, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": "(", | ||
| "position": 3, | ||
| "force_prefix": false, | ||
| "source_offset": 9, | ||
| "source_length": 0, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": "株", | ||
| "position": 4, | ||
| "force_prefix": false, | ||
| "source_offset": 9, | ||
| "source_length": 0, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": ")", | ||
| "position": 5, | ||
| "force_prefix": false, | ||
| "source_offset": 9, | ||
| "source_length": 3, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": "グル", | ||
| "position": 6, | ||
| "force_prefix": false, | ||
| "source_offset": 12, | ||
| "source_length": 6, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": "ルン", | ||
| "position": 7, | ||
| "force_prefix": false, | ||
| "source_offset": 15, | ||
| "source_length": 6, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": "ンガ", | ||
| "position": 8, | ||
| "force_prefix": false, | ||
| "source_offset": 18, | ||
| "source_length": 6, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": "", | ||
| "position": 9, | ||
| "force_prefix": false, | ||
| "source_offset": 24, | ||
| "source_length": 0, | ||
| "source_first_character_length": 0 | ||
| }, | ||
| { | ||
| "value": "ここ", | ||
| "position": 10, | ||
| "force_prefix": false, | ||
| "source_offset": 0, | ||
| "source_length": 6, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": "こは", | ||
| "position": 11, | ||
| "force_prefix": false, | ||
| "source_offset": 3, | ||
| "source_length": 6, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": "は株", | ||
| "position": 12, | ||
| "force_prefix": false, | ||
| "source_offset": 6, | ||
| "source_length": 6, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": "株グ", | ||
| "position": 13, | ||
| "force_prefix": false, | ||
| "source_offset": 9, | ||
| "source_length": 6, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": "グル", | ||
| "position": 14, | ||
| "force_prefix": false, | ||
| "source_offset": 12, | ||
| "source_length": 6, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": "ルン", | ||
| "position": 15, | ||
| "force_prefix": false, | ||
| "source_offset": 15, | ||
| "source_length": 6, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": "ンガ", | ||
| "position": 16, | ||
| "force_prefix": false, | ||
| "source_offset": 18, | ||
| "source_length": 6, | ||
| "source_first_character_length": 3 | ||
| }, | ||
| { | ||
| "value": "ガ", | ||
| "position": 17, | ||
| "force_prefix": false, | ||
| "source_offset": 21, | ||
| "source_length": 3, | ||
| "source_first_character_length": 3 | ||
| } | ||
| ] | ||
| ] |
5 changes: 5 additions & 0 deletions
5
test/command/suite/tokenizers/ngram/report_source_location/loose_symbol_kabu.test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| tokenize \ | ||
| 'TokenNgram("loose_symbol", true, \ | ||
| "report_source_location", true)' \ | ||
| "ここは㈱グルンガ" \ | ||
| 'NormalizerNFKC100("report_source_offset", true)' |