Skip to content

Commit

Permalink
Expose Japanese completion filter to kuromoji analysis plugin (#81858)
Browse files Browse the repository at this point in the history
This adds analysis factories for JapaneseCompletionFilter and JapaneseCompletionAnalyzer (see https://issues.apache.org/jira/browse/LUCENE-10102) to the kuromoji plugin.
  • Loading branch information
mocobeta committed Jan 31, 2022
1 parent 08d4f55 commit 997a600
Show file tree
Hide file tree
Showing 6 changed files with 182 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
extra.put("kuromoji_stemmer", KuromojiKatakanaStemmerFactory::new);
extra.put("ja_stop", JapaneseStopTokenFilterFactory::new);
extra.put("kuromoji_number", KuromojiNumberFilterFactory::new);
extra.put("kuromoji_completion", KuromojiCompletionFilterFactory::new);
return extra;
}

Expand All @@ -47,6 +48,9 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {

@Override
public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
return singletonMap("kuromoji", KuromojiAnalyzerProvider::new);
Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();
extra.put("kuromoji", KuromojiAnalyzerProvider::new);
extra.put("kuromoji_completion", KuromojiCompletionAnalyzerProvider::new);
return extra;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.plugin.analysis.kuromoji;

import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter.Mode;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;

/**
 * Analyzer provider that exposes Lucene's {@link JapaneseCompletionAnalyzer} to the index analysis registry.
 *
 * <p>The analyzer is configured from two inputs taken from the analyzer's settings: the user dictionary
 * resolved by {@code KuromojiTokenizerFactory.getUserDictionary} and the completion mode resolved by
 * {@code KuromojiCompletionFilterFactory.getMode}.
 */
public class KuromojiCompletionAnalyzerProvider extends AbstractIndexAnalyzerProvider<JapaneseCompletionAnalyzer> {

    /** Built once in the constructor; the same instance is handed out by every {@link #get()} call. */
    private final JapaneseCompletionAnalyzer analyzer;

    /**
     * Creates the provider and eagerly builds the underlying analyzer.
     *
     * @param indexSettings settings of the index this analyzer belongs to
     * @param env           node environment, passed on to the user dictionary lookup
     * @param name          name under which the analyzer is registered
     * @param settings      analyzer-level settings used to resolve the user dictionary and the mode
     */
    public KuromojiCompletionAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        // Resolve the dictionary first, then the mode, before constructing the analyzer.
        final UserDictionary dictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);
        final Mode completionMode = KuromojiCompletionFilterFactory.getMode(settings);
        this.analyzer = new JapaneseCompletionAnalyzer(dictionary, completionMode);
    }

    /**
     * @return the analyzer instance created in the constructor
     */
    @Override
    public JapaneseCompletionAnalyzer get() {
        return this.analyzer;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.plugin.analysis.kuromoji;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter.Mode;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;

/**
 * Token filter factory that wraps a token stream in Lucene's {@link JapaneseCompletionFilter}.
 *
 * <p>The filter mode is read once from the {@code mode} setting when the factory is constructed
 * and reused for every stream created afterwards.
 */
public class KuromojiCompletionFilterFactory extends AbstractTokenFilterFactory {

    /** Completion mode resolved from the filter settings at construction time. */
    private final Mode mode;

    /**
     * @param indexSettings settings of the index this filter belongs to
     * @param env           node environment (not used by this factory)
     * @param name          name under which the filter is registered
     * @param settings      filter-level settings; only {@code mode} is consulted here
     */
    public KuromojiCompletionFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        this.mode = getMode(settings);
    }

    /**
     * Resolves the completion mode from the {@code mode} setting.
     *
     * <p>The comparison is case-insensitive. {@code "query"} selects {@link Mode#QUERY}; every other
     * value, including a missing setting and any unrecognized string, yields {@link Mode#INDEX}.
     *
     * @param settings settings to read the {@code mode} key from
     * @return the resolved mode, never {@code null}
     */
    public static JapaneseCompletionFilter.Mode getMode(Settings settings) {
        final String configured = settings.get("mode", null);
        // equalsIgnoreCase(null) is false, so an absent setting falls through to the INDEX default,
        // as does "index" itself and any value that is not recognized.
        if ("query".equalsIgnoreCase(configured)) {
            return Mode.QUERY;
        }
        return Mode.INDEX;
    }

    /**
     * Wraps the given stream in a {@link JapaneseCompletionFilter} using the configured mode.
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new JapaneseCompletionFilter(tokenStream, this.mode);
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.Version;
Expand Down Expand Up @@ -67,10 +68,16 @@ public void testDefaultsKuromojiAnalysis() throws IOException {
filterFactory = analysis.tokenFilter.get("kuromoji_number");
assertThat(filterFactory, instanceOf(KuromojiNumberFilterFactory.class));

filterFactory = analysis.tokenFilter.get("kuromoji_completion");
assertThat(filterFactory, instanceOf(KuromojiCompletionFilterFactory.class));

IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
NamedAnalyzer analyzer = indexAnalyzers.get("kuromoji");
assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));

analyzer = indexAnalyzers.get("kuromoji_completion");
assertThat(analyzer.analyzer(), instanceOf(JapaneseCompletionAnalyzer.class));

analyzer = indexAnalyzers.get("my_analyzer");
assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class));
assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")), instanceOf(JapaneseTokenizer.class));
Expand Down Expand Up @@ -225,6 +232,42 @@ public void testJapaneseStopFilterFactory() throws IOException {
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}

/**
 * Checks that the {@code kuromoji_completion} token filter is registered and produces the expected
 * tokens for both filter instances declared in the test analysis settings.
 */
public void testCompletionFilterFactory() throws IOException {
    final TestAnalysis analysis = createTestAnalysis();

    // mode=INDEX
    final TokenFilterFactory indexFilter = analysis.tokenFilter.get("kuromoji_completion_index");
    assertThat(indexFilter, instanceOf(KuromojiCompletionFilterFactory.class));
    final Tokenizer indexTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    indexTokenizer.setReader(new StringReader("東京都"));
    assertSimpleTSOutput(indexFilter.create(indexTokenizer), new String[] { "東京", "toukyou", "都", "to" });

    // mode=QUERY
    final TokenFilterFactory queryFilter = analysis.tokenFilter.get("kuromoji_completion_query");
    assertThat(queryFilter, instanceOf(KuromojiCompletionFilterFactory.class));
    final Tokenizer queryTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    queryTokenizer.setReader(new StringReader("サッk"));
    assertSimpleTSOutput(queryFilter.create(queryTokenizer), new String[] { "サッk", "sakk" });
}

/**
 * Checks the token output of the two {@code kuromoji_completion} analyzers declared in the test
 * analysis settings, feeding the same input text to each.
 */
public void testCompletionAnalyzer() throws IOException {
    final TestAnalysis analysis = createTestAnalysis();
    final String text = "ソースコード";

    // mode=INDEX
    final Analyzer indexAnalyzer = analysis.indexAnalyzers.get("completion_index_analyzer");
    try (TokenStream indexStream = indexAnalyzer.tokenStream("", text)) {
        assertTokenStreamContents(indexStream, new String[] { "ソース", "soーsu", "コード", "koーdo" });
    }

    // mode=QUERY
    final Analyzer queryAnalyzer = analysis.indexAnalyzers.get("completion_query_analyzer");
    try (TokenStream queryStream = queryAnalyzer.tokenStream("", text)) {
        assertTokenStreamContents(queryStream, new String[] { "ソースコード", "soーsukoーdo" });
    }
}

private static TestAnalysis createTestAnalysis() throws IOException {
InputStream empty_dict = KuromojiAnalysisTests.class.getResourceAsStream("empty_user_dict.txt");
InputStream dict = KuromojiAnalysisTests.class.getResourceAsStream("user_dict.txt");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,14 @@
"ja_stop" : {
"type": "ja_stop",
"stopwords": ["_japanese_", "スピード"]
},
"kuromoji_completion_index" : {
"type": "kuromoji_completion",
"mode": "index"
},
"kuromoji_completion_query" : {
"type": "kuromoji_completion",
"mode": "query"
}
},

Expand Down Expand Up @@ -70,6 +78,14 @@
"my_analyzer" : {
"type" : "custom",
"tokenizer" : "kuromoji_tokenizer"
},
"completion_index_analyzer" : {
"type" : "kuromoji_completion",
"mode" : "index"
},
"completion_query_analyzer" : {
"type" : "kuromoji_completion",
"mode" : "query"
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,41 @@
filter: [kuromoji_stemmer]
- length: { tokens: 1 }
- match: { tokens.0.token: サーバ }
---
"Completion analyzer":
- do:
indices.create:
index: kuromoji_completion_sample
body:
settings:
index:
analysis:
analyzer:
completion_index:
type: kuromoji_completion
mode: index
completion_query:
type: kuromoji_completion
mode: query

- do:
indices.analyze:
index: kuromoji_completion_sample
body:
text: ソースコード
analyzer: completion_index
- length: { tokens: 4 }
- match: { tokens.0.token: ソース }
- match: { tokens.1.token: soーsu }
- match: { tokens.2.token: コード }
- match: { tokens.3.token: koーdo }

- do:
indices.analyze:
index: kuromoji_completion_sample
body:
text: ソースコード
analyzer: completion_query
- length: { tokens: 2 }
- match: { tokens.0.token: ソースコード }
- match: { tokens.1.token: soーsukoーdo }

0 comments on commit 997a600

Please sign in to comment.