Add "ja_stop" filter
 * can use the predefined "_japanese_" stop words
 * cannot use other predefined stop words
 * upgrade to Lucene 5
 * add ja_stop to README

  Closes #45
johtani committed Nov 26, 2014
1 parent 4ebd6fb commit 21bfe65
Showing 3 changed files with 49 additions and 17 deletions.
README.md: 45 additions & 0 deletions
@@ -44,6 +44,7 @@ The plugin includes these analyzer and tokenizer, tokenfilter.
| kuromoji_part_of_speech | tokenfilter |
| kuromoji_readingform | tokenfilter |
| kuromoji_stemmer | tokenfilter |
| ja_stop | tokenfilter |


Usage
@@ -391,6 +392,50 @@ curl -XPOST 'http://localhost:9200/kuromoji_sample/_analyze?analyzer=my_analyzer
```


## TokenFilter : ja_stop


A token filter of type `ja_stop` that provides the predefined "_japanese_" stop words.
*Note: Only the "_japanese_" predefined list is provided. If you want to use other predefined stop word lists, use the `stop` token filter instead (see the sketch after the example below).*

### example

```sh
curl -XPUT 'http://localhost:9200/kuromoji_sample/' -d'
{
    "settings": {
        "index":{
            "analysis":{
                "analyzer" : {
                    "analyzer_with_ja_stop" : {
                        "tokenizer" : "kuromoji_tokenizer",
                        "filter" : ["ja_stop"]
                    }
                },
                "filter" : {
                    "ja_stop" : {
                        "type" : "ja_stop",
                        "stopwords" : ["_japanese_", "ストップ"]
                    }
                }
            }
        }
    }
}
'

curl -XPOST 'http://localhost:9200/kuromoji_sample/_analyze?analyzer=analyzer_with_ja_stop&pretty' -d 'ストップは消える'
{
  "tokens" : [ {
    "token" : "消える",
    "start_offset" : 5,
    "end_offset" : 8,
    "type" : "word",
    "position" : 3
  } ]
}
```
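
The `ja_stop` filter only understands the "_japanese_" predefined list. For any other predefined list, fall back to Elasticsearch's built-in `stop` token filter, as mentioned in the note above. The following is only an illustrative sketch: the index name `stop_sample` and the analyzer/filter names are invented for this example, and it uses the standard `stop` filter with the predefined "_english_" list rather than anything provided by this plugin.

```sh
curl -XPUT 'http://localhost:9200/stop_sample/' -d'
{
    "settings": {
        "index":{
            "analysis":{
                "analyzer" : {
                    "analyzer_with_english_stop" : {
                        "tokenizer" : "standard",
                        "filter" : ["english_stop"]
                    }
                },
                "filter" : {
                    "english_stop" : {
                        "type" : "stop",
                        "stopwords" : ["_english_"]
                    }
                }
            }
        }
    }
}
'
```

Text analyzed with `analyzer_with_english_stop` would then have common English stop words such as "is" and "the" removed.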

License
-------

JapaneseStopTokenFilterFactory.java
@@ -25,8 +25,6 @@
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;
-import org.apache.lucene.util.Version;
-import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.collect.ImmutableMap;
import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.common.inject.Inject;
@@ -45,7 +43,6 @@ public class JapaneseStopTokenFilterFactory extends AbstractTokenFilterFactory{

    private final boolean ignoreCase;

-    private final boolean enablePositionIncrements;
    private final boolean removeTrailing;

    @Inject
@@ -56,20 +53,13 @@ public JapaneseStopTokenFilterFactory(Index index, @IndexSettings Settings index
        ImmutableMap<String, Set<?>> namedStopWords = MapBuilder.<String, Set<?>>newMapBuilder()
                .put("_japanese_", JapaneseAnalyzer.getDefaultStopSet())
                .immutableMap();
-        this.stopWords = Analysis.parseWords(env, settings, "stopwords", JapaneseAnalyzer.getDefaultStopSet(), namedStopWords, version, ignoreCase);
-        this.enablePositionIncrements = settings.getAsBoolean("enable_position_increments", true);
-        if (!enablePositionIncrements && version.onOrAfter(Version.LUCENE_44)) {
-            throw new ElasticsearchIllegalArgumentException("[enable_position_increments: false] is not supported anymore as of Lucene 4.4 as it can create broken token streams."
-                    + " Please fix your analysis chain or use an older compatibility version (<=4.3) but beware that it might cause unexpected behavior.");
-        }
+        this.stopWords = Analysis.parseWords(env, settings, "stopwords", JapaneseAnalyzer.getDefaultStopSet(), namedStopWords, ignoreCase);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        if (removeTrailing) {
-            StopFilter filter = new StopFilter(version, tokenStream, stopWords);
-            filter.setEnablePositionIncrements(enablePositionIncrements);
-            return filter;
+            return new StopFilter(tokenStream, stopWords);
        } else {
            return new SuggestStopFilter(tokenStream, stopWords);
        }
@@ -83,8 +73,4 @@ public boolean ignoreCase() {
        return ignoreCase;
    }

-    public boolean enablePositionIncrements() {
-        return this.enablePositionIncrements;
-    }
-
}
@@ -185,7 +185,8 @@ public void testJapaneseStopFilterFactory() throws IOException {
        assertThat(tokenFilter, instanceOf(JapaneseStopTokenFilterFactory.class));
        String source = "私は制限スピードを超える。";
        String[] expected = new String[]{"私", "制限", "超える"};
-        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(source), null, true, JapaneseTokenizer.Mode.SEARCH);
+        Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
+        tokenizer.setReader(new StringReader(source));
        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
    }

