Add "ja_stop" filter

* can use the predefined "_japanese_" stop word set
* cannot use other predefined stop word sets

Closes #45
elastic · Oct 21, 2014 · 2881cc0 · 2881cc0
1 parent ff686ac
commit 2881cc0
Show file tree

Hide file tree

Showing 4 changed files with 110 additions and 2 deletions.
diff --git a/src/main/java/org/elasticsearch/index/analysis/JapaneseStopTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/JapaneseStopTokenFilterFactory.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;
+import org.apache.lucene.util.Version;
+import org.elasticsearch.ElasticsearchIllegalArgumentException;
+import org.elasticsearch.common.collect.ImmutableMap;
+import org.elasticsearch.common.collect.MapBuilder;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+import java.util.Set;
+
+public class JapaneseStopTokenFilterFactory extends AbstractTokenFilterFactory{
+
+
+    private final CharArraySet stopWords;
+
+    private final boolean ignoreCase;
+
+    private final boolean enablePositionIncrements;
+    private final boolean removeTrailing;
+
+    @Inject
+    public JapaneseStopTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name, settings);
+        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
+        this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
+        ImmutableMap<String, Set<?>> namedStopWords = MapBuilder.<String, Set<?>>newMapBuilder()
+            .put("_japanese_", JapaneseAnalyzer.getDefaultStopSet())
+            .immutableMap();
+        this.stopWords = Analysis.parseWords(env, settings, "stopwords", JapaneseAnalyzer.getDefaultStopSet(), namedStopWords, version, ignoreCase);
+        this.enablePositionIncrements = settings.getAsBoolean("enable_position_increments", true);
+        if (!enablePositionIncrements && version.onOrAfter(Version.LUCENE_44)) {
+            throw new ElasticsearchIllegalArgumentException("[enable_position_increments: false] is not supported anymore as of Lucene 4.4 as it can create broken token streams."
+                + " Please fix your analysis chain or use an older compatibility version (<=4.3) but beware that it might cause unexpected behavior.");
+        }
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        if (removeTrailing) {
+            StopFilter filter = new StopFilter(version, tokenStream, stopWords);
+            filter.setEnablePositionIncrements(enablePositionIncrements);
+            return filter;
+        } else {
+            return new SuggestStopFilter(tokenStream, stopWords);
+        }
+    }
+
+    public Set<?> stopWords() {
+        return stopWords;
+    }
+
+    public boolean ignoreCase() {
+        return ignoreCase;
+    }
+
+    public boolean enablePositionIncrements() {
+        return this.enablePositionIncrements;
+    }
+
+}
diff --git a/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java b/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
@@ -55,5 +55,6 @@ public void onModule(AnalysisModule module) {
         module.addTokenFilter("kuromoji_part_of_speech", KuromojiPartOfSpeechFilterFactory.class);
         module.addTokenFilter("kuromoji_readingform", KuromojiReadingFormFilterFactory.class);
         module.addTokenFilter("kuromoji_stemmer", KuromojiKatakanaStemmerFactory.class);
+        module.addTokenFilter("ja_stop", JapaneseStopTokenFilterFactory.class);
     }
 }
diff --git a/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
@@ -71,6 +71,9 @@ public void testDefaultsKuromojiAnalysis() throws IOException {
         filterFactory = analysisService.tokenFilter("kuromoji_stemmer");
         assertThat(filterFactory, instanceOf(KuromojiKatakanaStemmerFactory.class));
 
+        filterFactory = analysisService.tokenFilter("ja_stop");
+        assertThat(filterFactory, instanceOf(JapaneseStopTokenFilterFactory.class));
+
         NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
         assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));
 
@@ -80,6 +83,7 @@ public void testDefaultsKuromojiAnalysis() throws IOException {
 
         CharFilterFactory  charFilterFactory = analysisService.charFilter("kuromoji_iteration_mark");
         assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
+
     }
 
     @Test
@@ -167,10 +171,20 @@ public void testIterationMarkCharFilter() throws IOException {
         expected = "ところどころ、ジジが、時時、馬鹿馬鹿しい";
 
         assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected);
+    }
 
-
+    @Test
+    public void testJapaneseStopFilterFactory() throws IOException {
+        AnalysisService analysisService = createAnalysisService();
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("ja_stop");
+        assertThat(tokenFilter, instanceOf(JapaneseStopTokenFilterFactory.class));
+        String source = "私は制限スピードを超える。";
+        String[] expected = new String[]{"私", "制限", "超える"};
+        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(source), null, true, JapaneseTokenizer.Mode.SEARCH);
+        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
     }
 
+
     public AnalysisService createAnalysisService() {
         Settings settings = ImmutableSettings.settingsBuilder()
                 .loadFromClasspath("org/elasticsearch/index/analysis/kuromoji_analysis.json")

diff --git a/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json b/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json
@@ -13,9 +13,12 @@
                 "kuromoji_ks" : {
                     "type": "kuromoji_stemmer",
                     "minimum_length" : 6
+                },
+                "ja_stop" : {
+                    "type": "ja_stop",
+                    "stopwords": ["_japanese_", "スピード"]
                 }
 
-
             },
 
             "char_filter":{