Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for inlined user dictionary in Nori #36123

Merged
merged 12 commits into from
Dec 7, 2018
38 changes: 37 additions & 1 deletion docs/plugins/analysis-nori.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ The first token is mandatory and represents the custom noun that should be added
the dictionary. For compound nouns the custom segmentation can be provided
after the first token (`[<token 1> ... <token n>]`). The segmentation of the
custom compound nouns is controlled by the `decompound_mode` setting.
--


As a demonstration of how the user dictionary can be used, save the following
dictionary to `$ES_HOME/config/userdict_ko.txt`:
Expand Down Expand Up @@ -153,6 +153,42 @@ The above `analyze` request returns the following:
// TESTRESPONSE

<1> This is a compound token that spans two positions (`mixed` mode).
--

`user_dictionary_rules`::
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For some reason this whole section doesn't render when I build the docs locally. I played around with it a bit but couldn't get it to work, so it's probably worth taking another look.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, thanks. I forgot to add the end-of-section delimiter (i.e. `--`), so the whole section was not displayed. I pushed adcee29 to fix this.

+
--

You can also inline the rules directly in the tokenizer definition using
the `user_dictionary_rules` option:

[source,js]
--------------------------------------------------
PUT nori_sample
{
"settings": {
"index": {
"analysis": {
"tokenizer": {
"nori_user_dict": {
"type": "nori_tokenizer",
"decompound_mode": "mixed",
"user_dictionary_rules": ["c++", "C샤프", "세종", "세종시 세종 시"]
}
},
"analyzer": {
"my_analyzer": {
"type": "custom",
"tokenizer": "nori_user_dict"
}
}
}
}
}
}
--------------------------------------------------
// CONSOLE
--

The `nori_tokenizer` sets a number of additional attributes per token that are used by token filters
to modify the stream.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,13 @@

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;
import java.util.Locale;

public class NoriTokenizerFactory extends AbstractTokenizerFactory {
private static final String USER_DICT_OPTION = "user_dictionary";
private static final String USER_DICT_PATH_OPTION = "user_dictionary";
private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";

private final UserDictionary userDictionary;
private final KoreanTokenizer.DecompoundMode decompoundMode;
Expand All @@ -44,12 +47,20 @@ public NoriTokenizerFactory(IndexSettings indexSettings, Environment env, String
}

public static UserDictionary getUserDictionary(Environment env, Settings settings) {
try (Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION)) {
if (reader == null) {
return null;
} else {
return UserDictionary.open(reader);
}
if (settings.get(USER_DICT_PATH_OPTION) != null && settings.get(USER_DICT_RULES_OPTION) != null) {
throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" +
" with [" + USER_DICT_RULES_OPTION + "]");
}
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION);
StringBuilder sb = new StringBuilder();
if (ruleList == null || ruleList.isEmpty()) {
return null;
}
for (String line : ruleList) {
sb.append(line).append(System.lineSeparator());
}
try (Reader rulesReader = new StringReader(sb.toString())) {
return UserDictionary.open(rulesReader);
} catch (IOException e) {
throw new ElasticsearchException("failed to load nori user dictionary", e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import java.nio.file.Files;
import java.nio.file.Path;

import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.instanceOf;

public class NoriAnalysisTests extends ESTokenStreamTestCase {
Expand Down Expand Up @@ -76,6 +77,22 @@ public void testNoriAnalyzer() throws Exception {
}

/**
 * Verifies that rules supplied inline through {@code user_dictionary_rules} are picked up
 * by the {@code nori} analyzer: a custom compound noun is split according to its rule and
 * a custom noun containing punctuation ({@code c++}) is kept as a single token.
 */
public void testNoriAnalyzerUserDict() throws Exception {
    Settings inlineRules = Settings.builder()
        .put("index.analysis.analyzer.my_analyzer.type", "nori")
        .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C샤프", "세종", "세종시 세종 시")
        .build();
    Analyzer noriAnalyzer = createTestAnalysis(inlineRules).indexAnalyzers.get("my_analyzer");

    // Compound noun declared in the inlined rules with an explicit segmentation.
    String[] expectedCompound = new String[]{"세종", "시"};
    try (TokenStream compoundTokens = noriAnalyzer.tokenStream("", "세종시")) {
        assertTokenStreamContents(compoundTokens, expectedCompound);
    }

    // Custom noun from the inlined rules followed by regular text.
    String[] expectedCustomNoun = new String[]{"c++", "world"};
    try (TokenStream customNounTokens = noriAnalyzer.tokenStream("", "c++world")) {
        assertTokenStreamContents(customNounTokens, expectedCustomNoun);
    }
}

public void testNoriAnalyzerUserDictPath() throws Exception {
Settings settings = Settings.builder()
.put("index.analysis.analyzer.my_analyzer.type", "nori")
.put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt")
Expand All @@ -91,6 +108,17 @@ public void testNoriAnalyzerUserDict() throws Exception {
}
}

/**
 * Verifies that configuring both {@code user_dictionary} (file path) and
 * {@code user_dictionary_rules} (inlined rules) on the same analyzer is rejected
 * with an {@link IllegalArgumentException}.
 */
public void testNoriAnalyzerInvalidUserDictOption() throws Exception {
    Settings conflicting = Settings.builder()
        .put("index.analysis.analyzer.my_analyzer.type", "nori")
        .put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt")
        .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C샤프", "세종", "세종시 세종 시")
        .build();

    String expectedFragment = "It is not allowed to use [user_dictionary] in conjunction " +
        "with [user_dictionary_rules]";
    IllegalArgumentException failure =
        expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(conflicting));
    assertThat(failure.getMessage(), containsString(expectedFragment));
}

public void testNoriTokenizer() throws Exception {
Settings settings = Settings.builder()
.put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,10 +221,21 @@ public static CharArraySet getWordSet(Environment env, Settings settings, String
* If the word list cannot be found at either key.
*/
public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) {
String wordListPath = settings.get(settingPrefix + "_path", null);
return getWordList(env, settings, settingPrefix + "_path", settingPrefix);
}

/**
* Fetches a list of words from the given settings. The list is either read inline from the key
* specified by <code>settingList</code> or loaded from the file referenced by <code>settingPath</code>.
* Returns <code>null</code> when neither key is set.
*
* @throws IllegalArgumentException
* If the file referenced by <code>settingPath</code> cannot be read or is not UTF-8 encoded.
*/
public static List<String> getWordList(Environment env, Settings settings, String settingPath, String settingList) {
String wordListPath = settings.get(settingPath, null);

if (wordListPath == null) {
List<String> explicitWordList = settings.getAsList(settingPrefix, null);
List<String> explicitWordList = settings.getAsList(settingList, null);
if (explicitWordList == null) {
return null;
} else {
Expand All @@ -238,11 +249,11 @@ public static List<String> getWordList(Environment env, Settings settings, Strin
return loadWordList(path, "#");
} catch (CharacterCodingException ex) {
String message = String.format(Locale.ROOT,
"Unsupported character encoding detected while reading %s_path: %s - files must be UTF-8 encoded",
settingPrefix, path.toString());
"Unsupported character encoding detected while reading %s: %s - files must be UTF-8 encoded",
settingPath, path.toString());
throw new IllegalArgumentException(message, ex);
} catch (IOException ioe) {
String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix, path.toString());
String message = String.format(Locale.ROOT, "IOException while reading %s: %s", settingPath, path.toString());
throw new IllegalArgumentException(message, ioe);
}
}
Expand Down