diff --git a/docs/plugins/analysis-nori.asciidoc b/docs/plugins/analysis-nori.asciidoc index 7cc04c9c3de75..1e5f998a72da4 100644 --- a/docs/plugins/analysis-nori.asciidoc +++ b/docs/plugins/analysis-nori.asciidoc @@ -54,6 +54,10 @@ It can be set to: 가곡역 => 가곡역, 가곡, 역 -- +`discard_punctuation`:: + + Whether punctuation should be discarded from the output. Defaults to `true`. + `user_dictionary`:: + -- @@ -99,6 +103,7 @@ PUT nori_sample "nori_user_dict": { "type": "nori_tokenizer", "decompound_mode": "mixed", + "discard_punctuation": "false", "user_dictionary": "userdict_ko.txt" } }, @@ -434,3 +439,107 @@ Which responds with: -------------------------------------------------- <1> The Hanja form is replaced by the Hangul translation. + + +[[analysis-nori-number]] +==== `nori_number` token filter + +The `nori_number` token filter normalizes Korean numbers +to regular Arabic decimal numbers in half-width characters. + +Korean numbers are often written using a combination of Hangul and Arabic numbers with various kinds of punctuation. +For example, 3.2천 means 3200. +This filter does this kind of normalization and allows a search for 3200 to match 3.2천 in text, +but can also be used to make range facets based on the normalized numbers and so on. + +[NOTE] +==== +Notice that this filter uses a token composition scheme and relies on punctuation tokens +being found in the token stream. +Please make sure your `nori_tokenizer` has `discard_punctuation` set to false. +If punctuation characters, such as U+FF0E(.), are removed from the token stream, +this filter would find input tokens 3 and 2천 and give outputs 3 and 2000 instead of 3200, +which is likely not the intended result. + +If you want to remove punctuation characters from your index that are not part of normalized numbers, +add a `stop` token filter with the punctuation you wish to remove after `nori_number` in your analyzer chain. +==== +Below are some examples of normalizations this filter supports.
+The input is untokenized text and the result is the single term attribute emitted for the input. + +- 영영칠 -> 7 +- 일영영영 -> 1000 +- 삼천2백2십삼 -> 3223 +- 조육백만오천일 -> 1000006005001 +- 3.2천 -> 3200 +- 1.2만345.67 -> 12345.67 +- 4,647.100 -> 4647.1 +- 15,7 -> 157 (be aware of this weakness) + +For example: + +[source,console] +-------------------------------------------------- +PUT nori_sample +{ + "settings": { + "index": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "tokenizer_discard_puncuation_false", + "filter": [ + "part_of_speech_stop_sp", "nori_number" + ] + } + }, + "tokenizer": { + "tokenizer_discard_puncuation_false": { + "type": "nori_tokenizer", + "discard_punctuation": "false" + } + }, + "filter": { + "part_of_speech_stop_sp": { + "type": "nori_part_of_speech", + "stoptags": ["SP"] + } + } + } + } + } +} + +GET nori_sample/_analyze +{ + "analyzer": "my_analyzer", + "text": "십만이천오백과 3.2천" +} +-------------------------------------------------- + +Which results in: + +[source,console-result] +-------------------------------------------------- +{ + "tokens" : [{ + "token" : "102500", + "start_offset" : 0, + "end_offset" : 6, + "type" : "word", + "position" : 0 + }, { + "token" : "과", + "start_offset" : 6, + "end_offset" : 7, + "type" : "word", + "position" : 1 + }, { + "token" : "3200", + "start_offset" : 8, + "end_offset" : 12, + "type" : "word", + "position" : 2 + }] +} +-------------------------------------------------- diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriNumberFilterFactory.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriNumberFilterFactory.java new file mode 100644 index 0000000000000..54a5ab9c2124e --- /dev/null +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriNumberFilterFactory.java @@ -0,0 +1,38 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ko.KoreanNumberFilter; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; + +public class NoriNumberFilterFactory extends AbstractTokenFilterFactory { + + public NoriNumberFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new KoreanNumberFilter(tokenStream); + } +} diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java index bac5dd2a77065..9680d6fd5f80a 100644 --- a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java @@ -39,11 +39,13 @@ public class NoriTokenizerFactory extends AbstractTokenizerFactory { private final UserDictionary userDictionary; private final 
KoreanTokenizer.DecompoundMode decompoundMode; + private final boolean discardPunctuation; public NoriTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, settings, name); decompoundMode = getMode(settings); userDictionary = getUserDictionary(env, settings); + discardPunctuation = settings.getAsBoolean("discard_punctuation", true); } public static UserDictionary getUserDictionary(Environment env, Settings settings) { @@ -77,7 +79,8 @@ public static KoreanTokenizer.DecompoundMode getMode(Settings settings) { @Override public Tokenizer create() { - return new KoreanTokenizer(KoreanTokenizer.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDictionary, decompoundMode, false); + return new KoreanTokenizer(KoreanTokenizer.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDictionary, decompoundMode, false, + discardPunctuation); } } diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/AnalysisNoriPlugin.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/AnalysisNoriPlugin.java index 6e9baa7acd26c..72097e2e83472 100644 --- a/plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/AnalysisNoriPlugin.java +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/AnalysisNoriPlugin.java @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer; import org.elasticsearch.index.analysis.AnalyzerProvider; import org.elasticsearch.index.analysis.NoriAnalyzerProvider; +import org.elasticsearch.index.analysis.NoriNumberFilterFactory; import org.elasticsearch.index.analysis.NoriPartOfSpeechStopFilterFactory; import org.elasticsearch.index.analysis.NoriReadingFormFilterFactory; import org.elasticsearch.index.analysis.NoriTokenizerFactory; @@ -42,6 +43,7 @@ public Map> getTokenFilters() { Map> extra = new HashMap<>(); extra.put("nori_part_of_speech", NoriPartOfSpeechStopFilterFactory::new); extra.put("nori_readingform", 
NoriReadingFormFilterFactory::new); + extra.put("nori_number", NoriNumberFilterFactory::new); return extra; } diff --git a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/AnalysisNoriFactoryTests.java b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/AnalysisNoriFactoryTests.java index 1677ba94b8783..de70e26fc6f01 100644 --- a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/AnalysisNoriFactoryTests.java +++ b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/AnalysisNoriFactoryTests.java @@ -43,6 +43,7 @@ protected Map> getTokenFilters() { Map> filters = new HashMap<>(super.getTokenFilters()); filters.put("koreanpartofspeechstop", NoriPartOfSpeechStopFilterFactory.class); filters.put("koreanreadingform", NoriReadingFormFilterFactory.class); + filters.put("koreannumber", NoriNumberFilterFactory.class); return filters; } } diff --git a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java index 051a2f3e4dc32..87c78c7f981b9 100644 --- a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java +++ b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java @@ -54,6 +54,9 @@ public void testDefaultsNoriAnalysis() throws IOException { filterFactory = analysis.tokenFilter.get("nori_readingform"); assertThat(filterFactory, instanceOf(NoriReadingFormFilterFactory.class)); + filterFactory = analysis.tokenFilter.get("nori_number"); + assertThat(filterFactory, instanceOf(NoriNumberFilterFactory.class)); + IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers; NamedAnalyzer analyzer = indexAnalyzers.get("nori"); assertThat(analyzer.analyzer(), instanceOf(KoreanAnalyzer.class)); @@ -130,6 +133,33 @@ public void testNoriTokenizer() throws Exception { assertTokenStreamContents(tokenizer, new String[] 
{"뿌리", "가", "깊", "은", "나무"}); tokenizer.setReader(new StringReader("가늠표")); assertTokenStreamContents(tokenizer, new String[] {"가늠표", "가늠", "표"}); + // discard_punctuation default(true) + tokenizer.setReader(new StringReader("3.2개")); + assertTokenStreamContents(tokenizer, new String[] {"3", "2", "개"}); + } + + public void testNoriTokenizerDiscardPunctuationOptionTrue() throws Exception { + Settings settings = createDiscardPunctuationOption("true"); + TestAnalysis analysis = createTestAnalysis(settings); + Tokenizer tokenizer = analysis.tokenizer.get("my_tokenizer").create(); + tokenizer.setReader(new StringReader("3.2개")); + assertTokenStreamContents(tokenizer, new String[] {"3", "2", "개"}); + } + + public void testNoriTokenizerDiscardPunctuationOptionFalse() throws Exception { + Settings settings = createDiscardPunctuationOption("false"); + TestAnalysis analysis = createTestAnalysis(settings); + Tokenizer tokenizer = analysis.tokenizer.get("my_tokenizer").create(); + tokenizer.setReader(new StringReader("3.2개")); + assertTokenStreamContents(tokenizer, new String[] {"3", ".", "2", "개"}); + } + + public void testNoriTokenizerInvalidDiscardPunctuationOption() { + String wrongOption = "wrong"; + Settings settings = createDiscardPunctuationOption(wrongOption); + IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings)); + assertThat(exc.getMessage(), containsString("Failed to parse value [" + wrongOption + + "] as only [true] or [false] are allowed.")); } public void testNoriPartOfSpeech() throws IOException { @@ -159,6 +189,27 @@ public void testNoriReadingForm() throws IOException { assertTokenStreamContents(stream, new String[] {"향가"}); } + public void testNoriNumber() throws IOException { + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_filter.type", 
"nori_number") + .build(); + TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new AnalysisNoriPlugin()); + TokenFilterFactory factory = analysis.tokenFilter.get("my_filter"); + Tokenizer tokenizer = new KoreanTokenizer(); + tokenizer.setReader(new StringReader("오늘 십만이천오백원짜리 와인 구입")); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] {"오늘", "102500", "원", "짜리", "와인", "구입"}); + } + + private Settings createDiscardPunctuationOption(String option) { + return Settings.builder() + .put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer") + .put("index.analysis.tokenizer.my_tokenizer.discard_punctuation", option) + .build(); + } + private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException { InputStream dict = NoriAnalysisTests.class.getResourceAsStream("user_dict.txt"); Path home = createTempDir(); diff --git a/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml b/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml index a5aa9998da6ba..523874f5743bb 100644 --- a/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml +++ b/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml @@ -46,3 +46,20 @@ filter: [nori_readingform] - length: { tokens: 1 } - match: { tokens.0.token: 향가 } +--- +"Number filter": + - do: + indices.analyze: + body: + text: 십만이천오백과 3.2천 + tokenizer: + type: nori_tokenizer + discard_punctuation: false + filter: + - type: nori_part_of_speech + stoptags: ["SP"] + - type: nori_number + - length: { tokens: 3 } + - match: { tokens.0.token: "102500"} + - match: { tokens.1.token: 과} + - match: { tokens.2.token: "3200"}