From b6221c5748099e6902e2ad91c5207ab5ac2005b3 Mon Sep 17 00:00:00 2001 From: Namgyu Kim Date: Sun, 15 Mar 2020 05:39:21 +0900 Subject: [PATCH 1/5] add nori_number token filter --- docs/plugins/analysis-nori.asciidoc | 49 +++++++++++++++++++ .../analysis/NoriNumberFilterFactory.java | 38 ++++++++++++++ .../analysis/nori/AnalysisNoriPlugin.java | 2 + .../analysis/AnalysisNoriFactoryTests.java | 1 + .../index/analysis/NoriAnalysisTests.java | 17 +++++++ .../test/analysis_nori/10_basic.yml | 10 ++++ 6 files changed, 117 insertions(+) create mode 100644 plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriNumberFilterFactory.java diff --git a/docs/plugins/analysis-nori.asciidoc b/docs/plugins/analysis-nori.asciidoc index 7cc04c9c3de75..5febb98831f08 100644 --- a/docs/plugins/analysis-nori.asciidoc +++ b/docs/plugins/analysis-nori.asciidoc @@ -434,3 +434,52 @@ Which responds with: -------------------------------------------------- <1> The Hanja form is replaced by the Hangul translation. + + +[[analysis-nori-number]] +==== `nori_number` token filter + +The `nori_number` token filter normalizes Korean numbers +to regular Arabic decimal numbers in half-width characters. 
For example: + +[source,console] +-------------------------------------------------- +PUT nori_sample +{ + "settings": { + "index": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "nori_tokenizer", + "filter": [ + "nori_number" + ] + } + } + } + } + } +} + +GET nori_sample/_analyze +{ + "analyzer": "my_analyzer", + "text": "십만이천오백" +} +-------------------------------------------------- + +Which results in: + +[source,console-result] +-------------------------------------------------- +{ + "tokens" : [ { + "token" : "102500", + "start_offset" : 0, + "end_offset" : 6, + "type" : "word", + "position" : 0 + } ] +} +-------------------------------------------------- diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriNumberFilterFactory.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriNumberFilterFactory.java new file mode 100644 index 0000000000000..54a5ab9c2124e --- /dev/null +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriNumberFilterFactory.java @@ -0,0 +1,38 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ko.KoreanNumberFilter; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; + +public class NoriNumberFilterFactory extends AbstractTokenFilterFactory { + + public NoriNumberFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new KoreanNumberFilter(tokenStream); + } +} diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/AnalysisNoriPlugin.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/AnalysisNoriPlugin.java index 6e9baa7acd26c..72097e2e83472 100644 --- a/plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/AnalysisNoriPlugin.java +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/AnalysisNoriPlugin.java @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer; import org.elasticsearch.index.analysis.AnalyzerProvider; import org.elasticsearch.index.analysis.NoriAnalyzerProvider; +import org.elasticsearch.index.analysis.NoriNumberFilterFactory; import org.elasticsearch.index.analysis.NoriPartOfSpeechStopFilterFactory; import org.elasticsearch.index.analysis.NoriReadingFormFilterFactory; import org.elasticsearch.index.analysis.NoriTokenizerFactory; @@ -42,6 +43,7 @@ public Map> getTokenFilters() { Map> extra = new HashMap<>(); extra.put("nori_part_of_speech", NoriPartOfSpeechStopFilterFactory::new); extra.put("nori_readingform", NoriReadingFormFilterFactory::new); + extra.put("nori_number", NoriNumberFilterFactory::new); return extra; } diff --git a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/AnalysisNoriFactoryTests.java 
b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/AnalysisNoriFactoryTests.java index 1677ba94b8783..de70e26fc6f01 100644 --- a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/AnalysisNoriFactoryTests.java +++ b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/AnalysisNoriFactoryTests.java @@ -43,6 +43,7 @@ protected Map> getTokenFilters() { Map> filters = new HashMap<>(super.getTokenFilters()); filters.put("koreanpartofspeechstop", NoriPartOfSpeechStopFilterFactory.class); filters.put("koreanreadingform", NoriReadingFormFilterFactory.class); + filters.put("koreannumber", NoriNumberFilterFactory.class); return filters; } } diff --git a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java index 051a2f3e4dc32..64b382e52048f 100644 --- a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java +++ b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java @@ -54,6 +54,9 @@ public void testDefaultsNoriAnalysis() throws IOException { filterFactory = analysis.tokenFilter.get("nori_readingform"); assertThat(filterFactory, instanceOf(NoriReadingFormFilterFactory.class)); + filterFactory = analysis.tokenFilter.get("nori_number"); + assertThat(filterFactory, instanceOf(NoriNumberFilterFactory.class)); + IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers; NamedAnalyzer analyzer = indexAnalyzers.get("nori"); assertThat(analyzer.analyzer(), instanceOf(KoreanAnalyzer.class)); @@ -159,6 +162,20 @@ public void testNoriReadingForm() throws IOException { assertTokenStreamContents(stream, new String[] {"향가"}); } + public void testNoriNumber() throws IOException { + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put(Environment.PATH_HOME_SETTING.getKey(), 
createTempDir().toString()) + .put("index.analysis.filter.my_filter.type", "nori_number") + .build(); + TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new AnalysisNoriPlugin()); + TokenFilterFactory factory = analysis.tokenFilter.get("my_filter"); + Tokenizer tokenizer = new KoreanTokenizer(); + tokenizer.setReader(new StringReader("오늘 십만이천오백원짜리 와인 구입")); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] {"오늘", "102500", "원", "짜리", "와인", "구입"}); + } + private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException { InputStream dict = NoriAnalysisTests.class.getResourceAsStream("user_dict.txt"); Path home = createTempDir(); diff --git a/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml b/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml index a5aa9998da6ba..bfd041464b7e4 100644 --- a/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml +++ b/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml @@ -46,3 +46,13 @@ filter: [nori_readingform] - length: { tokens: 1 } - match: { tokens.0.token: 향가 } +--- +"Number filter": + - do: + indices.analyze: + body: + text: 십만이천오백 + tokenizer: nori_tokenizer + filter: [nori_number] + - length: { tokens: 1 } + - match: { tokens.0.token: "102500" } From d6dbe514fa70c8278f180a3a602a75601d588d39 Mon Sep 17 00:00:00 2001 From: Namgyu Kim Date: Fri, 20 Mar 2020 02:00:50 +0900 Subject: [PATCH 2/5] add discard_punctuation option in nori_tokenizer --- docs/plugins/analysis-nori.asciidoc | 5 +++++ .../elasticsearch/index/analysis/NoriTokenizerFactory.java | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/plugins/analysis-nori.asciidoc b/docs/plugins/analysis-nori.asciidoc index 5febb98831f08..2a129cbd5cc6c 100644 --- a/docs/plugins/analysis-nori.asciidoc +++ 
b/docs/plugins/analysis-nori.asciidoc @@ -54,6 +54,10 @@ It can be set to: 가곡역 => 가곡역, 가곡, 역 -- +`discard_punctuation`:: + + Whether punctuation should be discarded from the output. Defaults to `true`. + `user_dictionary`:: + -- @@ -99,6 +103,7 @@ PUT nori_sample "nori_user_dict": { "type": "nori_tokenizer", "decompound_mode": "mixed", + "discard_punctuation": "false", "user_dictionary": "userdict_ko.txt" } }, diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java index bac5dd2a77065..9680d6fd5f80a 100644 --- a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java @@ -39,11 +39,13 @@ public class NoriTokenizerFactory extends AbstractTokenizerFactory { private final UserDictionary userDictionary; private final KoreanTokenizer.DecompoundMode decompoundMode; + private final boolean discardPunctuation; public NoriTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, settings, name); decompoundMode = getMode(settings); userDictionary = getUserDictionary(env, settings); + discardPunctuation = settings.getAsBoolean("discard_punctuation", true); } public static UserDictionary getUserDictionary(Environment env, Settings settings) { @@ -77,7 +79,8 @@ public static KoreanTokenizer.DecompoundMode getMode(Settings settings) { @Override public Tokenizer create() { - return new KoreanTokenizer(KoreanTokenizer.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDictionary, decompoundMode, false); + return new KoreanTokenizer(KoreanTokenizer.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDictionary, decompoundMode, false, + discardPunctuation); } } From 9bd0ebe9ab119a49f6a829d308d8238d24050ea2 Mon Sep 17 00:00:00 2001 From: Namgyu Kim Date: Fri, 20 Mar 2020 
02:23:43 +0900 Subject: [PATCH 3/5] add description about using discard_punctuation in nori_number --- docs/plugins/analysis-nori.asciidoc | 65 +++++++++++++++++-- .../test/analysis_nori/10_basic.yml | 23 ++++--- 2 files changed, 74 insertions(+), 14 deletions(-) diff --git a/docs/plugins/analysis-nori.asciidoc b/docs/plugins/analysis-nori.asciidoc index 2a129cbd5cc6c..fc89c49a6b8aa 100644 --- a/docs/plugins/analysis-nori.asciidoc +++ b/docs/plugins/analysis-nori.asciidoc @@ -445,7 +445,36 @@ Which responds with: ==== `nori_number` token filter The `nori_number` token filter normalizes Korean numbers -to regular Arabic decimal numbers in half-width characters. For example: +to regular Arabic decimal numbers in half-width characters. + +Korean numbers are often written using a combination of Hangul and Arabic numbers with various kinds of punctuation. +For example, 3.2천 means 3200. +This filter does this kind of normalization and allows a search for 3200 to match 3.2천 in text, +but can also be used to make range facets based on the normalized numbers and so on. + +Notice that this analyzer uses a token composition scheme and relies on punctuation tokens +being found in the token stream. +Please make sure your `nori_tokenizer` has `discard_punctuation` set to false. +In case punctuation characters, such as U+FF0E(.), are removed from the token stream, +this filter would find input tokens 3 and 2천 and give outputs 3 and 2000 instead of 3200, +which is likely not the intended result. + +If you want to remove punctuation characters from your index that are not part of normalized numbers, +add a `stop` token filter with the punctuation you wish to remove after `nori_number` in your analyzer chain. + +Below are some examples of normalizations this filter supports. +The input is untokenized text and the result is the single term attribute emitted for the input. 
+ +- 영영칠 -> 7 +- 일영영영 -> 1000 +- 삼천2백2십삼 -> 3223 +- 조육백만오천일 -> 1000006005001 +- 3.2천 -> 3200 +- 1.2만345.67 -> 12345.67 +- 4,647.100 -> 4647.1 +- 15,7 -> 157 (be aware of this weakness) + +For example: [source,console] -------------------------------------------------- @@ -456,11 +485,23 @@ PUT nori_sample "analysis": { "analyzer": { "my_analyzer": { - "tokenizer": "nori_tokenizer", + "tokenizer": "tokenizer_discard_punctuation_false", "filter": [ - "nori_number" + "part_of_speech_stop_sp", "nori_number" ] } + }, + "tokenizer": { + "tokenizer_discard_punctuation_false": { + "type": "nori_tokenizer", + "discard_punctuation": "false" + } + }, + "filter": { + "part_of_speech_stop_sp": { + "type": "nori_part_of_speech", + "stoptags": ["SP"] + } } } } @@ -470,7 +511,7 @@ PUT nori_sample GET nori_sample/_analyze { "analyzer": "my_analyzer", - "text": "십만이천오백" + "text": "십만이천오백과 3.2천" } -------------------------------------------------- @@ -479,12 +520,24 @@ Which results in: [source,console-result] -------------------------------------------------- { - "tokens" : [ { + "tokens" : [{ "token" : "102500", "start_offset" : 0, "end_offset" : 6, "type" : "word", "position" : 0 - } ] + }, { + "token" : "과", + "start_offset" : 6, + "end_offset" : 7, + "type" : "word", + "position" : 1 + }, { + "token" : "3200", + "start_offset" : 8, + "end_offset" : 12, + "type" : "word", + "position" : 2 + }] } -------------------------------------------------- diff --git a/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml b/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml index bfd041464b7e4..694ae0d49dcda 100644 --- a/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml +++ b/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml @@ -48,11 +48,18 @@ - match: { tokens.0.token: 향가 } --- "Number filter": - - do: - indices.analyze: - body: - text: 십만이천오백 - 
tokenizer: nori_tokenizer - filter: [nori_number] - - length: { tokens: 1 } - - match: { tokens.0.token: "102500" } + - do: + indices.analyze: + body: + text: 십만이천오백과 3.2천 + tokenizer: + type: nori_tokenizer + discard_punctuation: false + filter: + - type: nori_part_of_speech + stoptags: ["SP"] + - type: nori_number + - length: { tokens: 3 } + - match: { tokens.0.token: "102500"} + - match: { tokens.1.token: 과} + - match: { tokens.2.token: "3200"} From 6ed33a19b7a9f5e4c6dc9b628cf2f6202113c25d Mon Sep 17 00:00:00 2001 From: Namgyu Kim Date: Sat, 21 Mar 2020 03:24:03 +0900 Subject: [PATCH 4/5] add note in asciidoc and test cases for discard_punctuation --- docs/plugins/analysis-nori.asciidoc | 4 ++- .../index/analysis/NoriAnalysisTests.java | 34 +++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/docs/plugins/analysis-nori.asciidoc b/docs/plugins/analysis-nori.asciidoc index fc89c49a6b8aa..1e5f998a72da4 100644 --- a/docs/plugins/analysis-nori.asciidoc +++ b/docs/plugins/analysis-nori.asciidoc @@ -452,6 +452,8 @@ For example, 3.2천 means 3200. This filter does this kind of normalization and allows a search for 3200 to match 3.2천 in text, but can also be used to make range facets based on the normalized numbers and so on. +[NOTE] +==== Notice that this analyzer uses a token composition scheme and relies on punctuation tokens being found in the token stream. Please make sure your `nori_tokenizer` has `discard_punctuation` set to false. @@ -461,7 +463,7 @@ which is likely not the intended result. If you want to remove punctuation characters from your index that are not part of normalized numbers, add a `stop` token filter with the punctuation you wish to remove after `nori_number` in your analyzer chain. - +==== Below are some examples of normalizations this filter supports. The input is untokenized text and the result is the single term attribute emitted for the input. 
diff --git a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java index 64b382e52048f..87c78c7f981b9 100644 --- a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java +++ b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java @@ -133,6 +133,33 @@ public void testNoriTokenizer() throws Exception { assertTokenStreamContents(tokenizer, new String[] {"뿌리", "가", "깊", "은", "나무"}); tokenizer.setReader(new StringReader("가늠표")); assertTokenStreamContents(tokenizer, new String[] {"가늠표", "가늠", "표"}); + // discard_punctuation default(true) + tokenizer.setReader(new StringReader("3.2개")); + assertTokenStreamContents(tokenizer, new String[] {"3", "2", "개"}); + } + + public void testNoriTokenizerDiscardPunctuationOptionTrue() throws Exception { + Settings settings = createDiscardPunctuationOption("true"); + TestAnalysis analysis = createTestAnalysis(settings); + Tokenizer tokenizer = analysis.tokenizer.get("my_tokenizer").create(); + tokenizer.setReader(new StringReader("3.2개")); + assertTokenStreamContents(tokenizer, new String[] {"3", "2", "개"}); + } + + public void testNoriTokenizerDiscardPunctuationOptionFalse() throws Exception { + Settings settings = createDiscardPunctuationOption("false"); + TestAnalysis analysis = createTestAnalysis(settings); + Tokenizer tokenizer = analysis.tokenizer.get("my_tokenizer").create(); + tokenizer.setReader(new StringReader("3.2개")); + assertTokenStreamContents(tokenizer, new String[] {"3", ".", "2", "개"}); + } + + public void testNoriTokenizerInvalidDiscardPunctuationOption() { + String wrongOption = "wrong"; + Settings settings = createDiscardPunctuationOption(wrongOption); + IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings)); + assertThat(exc.getMessage(), 
containsString("Failed to parse value [" + wrongOption + + "] as only [true] or [false] are allowed.")); } public void testNoriPartOfSpeech() throws IOException { @@ -176,6 +203,13 @@ public void testNoriNumber() throws IOException { assertTokenStreamContents(stream, new String[] {"오늘", "102500", "원", "짜리", "와인", "구입"}); } + private Settings createDiscardPunctuationOption(String option) { + return Settings.builder() + .put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer") + .put("index.analysis.tokenizer.my_tokenizer.discard_punctuation", option) + .build(); + } + private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException { InputStream dict = NoriAnalysisTests.class.getResourceAsStream("user_dict.txt"); Path home = createTempDir(); From 1a4367ea993cfc34d7d034e03c5b9cb80790fe3c Mon Sep 17 00:00:00 2001 From: Namgyu Kim Date: Tue, 24 Mar 2020 01:57:16 +0900 Subject: [PATCH 5/5] fix wrong indentation in nori_number test --- .../test/analysis_nori/10_basic.yml | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml b/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml index 694ae0d49dcda..523874f5743bb 100644 --- a/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml +++ b/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml @@ -48,18 +48,18 @@ - match: { tokens.0.token: 향가 } --- "Number filter": - - do: - indices.analyze: - body: - text: 십만이천오백과 3.2천 - tokenizer: - type: nori_tokenizer - discard_punctuation: false - filter: - - type: nori_part_of_speech - stoptags: ["SP"] - - type: nori_number - - length: { tokens: 3 } - - match: { tokens.0.token: "102500"} - - match: { tokens.1.token: 과} - - match: { tokens.2.token: "3200"} + - do: + indices.analyze: + body: + text: 십만이천오백과 3.2천 + tokenizer: + type: 
nori_tokenizer + discard_punctuation: false + filter: + - type: nori_part_of_speech + stoptags: ["SP"] + - type: nori_number + - length: { tokens: 3 } + - match: { tokens.0.token: "102500"} + - match: { tokens.1.token: 과} + - match: { tokens.2.token: "3200"}