Skip to content

Commit

Permalink
Add "ja_stop" filter
Browse files Browse the repository at this point in the history
 * can use the predefined "_japanese_" stop word set
 * cannot use any other predefined stop word sets

Closes #45
  • Loading branch information
johtani committed Oct 21, 2014
1 parent ff686ac commit 2881cc0
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.analysis;


import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;
import org.apache.lucene.util.Version;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.collect.ImmutableMap;
import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

import java.util.Set;

/**
 * Factory for a stop-word token filter whose default word list, and only
 * predefined named list, is {@link JapaneseAnalyzer#getDefaultStopSet()}.
 *
 * <p>Settings read by the constructor:
 * <ul>
 *   <li>{@code stopwords} - resolved through {@code Analysis.parseWords};
 *       {@code _japanese_} is the only named set made available to it</li>
 *   <li>{@code ignore_case} - defaults to {@code false}</li>
 *   <li>{@code remove_trailing} - defaults to {@code true}</li>
 *   <li>{@code enable_position_increments} - defaults to {@code true}</li>
 * </ul>
 */
public class JapaneseStopTokenFilterFactory extends AbstractTokenFilterFactory {

    /** Stop words resolved from the {@code stopwords} setting. */
    private final CharArraySet stopWords;

    /** Value of the {@code ignore_case} setting. */
    private final boolean ignoreCase;

    /** Value of the {@code enable_position_increments} setting. */
    private final boolean enablePositionIncrements;

    /** Value of the {@code remove_trailing} setting; selects the filter built by {@link #create}. */
    private final boolean removeTrailing;

    /**
     * Reads the filter configuration from {@code settings}.
     *
     * @throws ElasticsearchIllegalArgumentException if {@code enable_position_increments}
     *         is {@code false} while the factory's Lucene version is 4.4 or later
     */
    @Inject
    public JapaneseStopTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);

        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
        this.removeTrailing = settings.getAsBoolean("remove_trailing", true);

        // The Japanese default set doubles as the fallback when "stopwords" is not configured.
        this.stopWords = Analysis.parseWords(
                env,
                settings,
                "stopwords",
                JapaneseAnalyzer.getDefaultStopSet(),
                predefinedStopWordSets(),
                version,
                this.ignoreCase);

        this.enablePositionIncrements = settings.getAsBoolean("enable_position_increments", true);
        final boolean positionIncrementsDisabled = !this.enablePositionIncrements;
        if (positionIncrementsDisabled && version.onOrAfter(Version.LUCENE_44)) {
            throw new ElasticsearchIllegalArgumentException("[enable_position_increments: false] is not supported anymore as of Lucene 4.4 as it can create broken token streams."
                    + " Please fix your analysis chain or use an older compatibility version (<=4.3) but beware that it might cause unexpected behavior.");
        }
    }

    /** Builds the lookup of named stop-word sets; it holds the single entry {@code _japanese_}. */
    private static ImmutableMap<String, Set<?>> predefinedStopWordSets() {
        MapBuilder<String, Set<?>> sets = MapBuilder.newMapBuilder();
        sets.put("_japanese_", JapaneseAnalyzer.getDefaultStopSet());
        return sets.immutableMap();
    }

    /**
     * Wraps {@code tokenStream} in a stop filter backed by the configured stop words.
     *
     * @return a {@link SuggestStopFilter} when {@code remove_trailing} is {@code false},
     *         otherwise a {@link StopFilter} with the configured position-increment flag applied
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        if (!removeTrailing) {
            return new SuggestStopFilter(tokenStream, stopWords);
        }
        StopFilter stopFilter = new StopFilter(version, tokenStream, stopWords);
        stopFilter.setEnablePositionIncrements(enablePositionIncrements);
        return stopFilter;
    }

    /** @return the stop words this factory hands to the filters it creates */
    public Set<?> stopWords() {
        return this.stopWords;
    }

    /** @return the configured {@code ignore_case} value */
    public boolean ignoreCase() {
        return this.ignoreCase;
    }

    /** @return the configured {@code enable_position_increments} value */
    public boolean enablePositionIncrements() {
        return this.enablePositionIncrements;
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -55,5 +55,6 @@ public void onModule(AnalysisModule module) {
module.addTokenFilter("kuromoji_part_of_speech", KuromojiPartOfSpeechFilterFactory.class);
module.addTokenFilter("kuromoji_readingform", KuromojiReadingFormFilterFactory.class);
module.addTokenFilter("kuromoji_stemmer", KuromojiKatakanaStemmerFactory.class);
module.addTokenFilter("ja_stop", JapaneseStopTokenFilterFactory.class);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ public void testDefaultsKuromojiAnalysis() throws IOException {
filterFactory = analysisService.tokenFilter("kuromoji_stemmer");
assertThat(filterFactory, instanceOf(KuromojiKatakanaStemmerFactory.class));

filterFactory = analysisService.tokenFilter("ja_stop");
assertThat(filterFactory, instanceOf(JapaneseStopTokenFilterFactory.class));

NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));

Expand All @@ -80,6 +83,7 @@ public void testDefaultsKuromojiAnalysis() throws IOException {

CharFilterFactory charFilterFactory = analysisService.charFilter("kuromoji_iteration_mark");
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));

}

@Test
Expand Down Expand Up @@ -167,10 +171,20 @@ public void testIterationMarkCharFilter() throws IOException {
expected = "ところどころ、ジジが、時時、馬鹿馬鹿しい";

assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected);
}


/**
 * Checks that the "ja_stop" filter resolves to {@link JapaneseStopTokenFilterFactory}
 * and that running it over a tokenized sentence leaves only the expected terms.
 */
@Test
public void testJapaneseStopFilterFactory() throws IOException {
    final String source = "私は制限スピードを超える。";
    // Terms expected to remain once the filter has dropped the stop words.
    final String[] expected = {"私", "制限", "超える"};

    AnalysisService service = createAnalysisService();
    TokenFilterFactory stopFactory = service.tokenFilter("ja_stop");
    assertThat(stopFactory, instanceOf(JapaneseStopTokenFilterFactory.class));

    StringReader input = new StringReader(source);
    Tokenizer searchModeTokenizer = new JapaneseTokenizer(input, null, true, JapaneseTokenizer.Mode.SEARCH);
    TokenStream filtered = stopFactory.create(searchModeTokenizer);
    assertSimpleTSOutput(filtered, expected);
}


public AnalysisService createAnalysisService() {
Settings settings = ImmutableSettings.settingsBuilder()
.loadFromClasspath("org/elasticsearch/index/analysis/kuromoji_analysis.json")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@
"kuromoji_ks" : {
"type": "kuromoji_stemmer",
"minimum_length" : 6
},
"ja_stop" : {
"type": "ja_stop",
"stopwords": ["_japanese_", "スピード"]
}


},

"char_filter":{
Expand Down

0 comments on commit 2881cc0

Please sign in to comment.