Skip to content

Commit

Permalink
Add Persian language stemmer
Browse files Browse the repository at this point in the history
  • Loading branch information
rezatorabi committed Aug 31, 2023
1 parent 8c89f31 commit 3a38385
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 0 deletions.
Expand Up @@ -322,6 +322,7 @@ public TokenStream create(TokenStream tokenStream) {
filters.put("pattern_capture", requiresAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new));
filters.put("persian_normalization", PersianNormalizationFilterFactory::new);
filters.put("persian_stem", PersianStemTokenFilterFactory::new);
filters.put("porter_stem", PorterStemTokenFilterFactory::new);
filters.put(
"predicate_token_filter",
Expand Down
@@ -0,0 +1,28 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fa.PersianStemFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;

/**
 * Token filter factory that wraps an incoming token stream in Lucene's
 * {@link PersianStemFilter}.
 */
public class PersianStemTokenFilterFactory extends AbstractTokenFilterFactory {

    /**
     * Builds the factory.
     *
     * @param indexSettings index-level settings; not read by this factory
     * @param environment   node environment; not read by this factory
     * @param name          name of this filter, forwarded to the superclass
     * @param settings      filter settings, forwarded to the superclass
     */
    PersianStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        // Only the name and settings are handed to the base class; the other two
        // parameters are part of the constructor shape but are unused here.
        super(name, settings);
    }

    /**
     * Wraps the given stream in a new {@link PersianStemFilter}.
     *
     * @param tokenStream the upstream token stream to wrap
     * @return a new {@link PersianStemFilter} reading from {@code tokenStream}
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        final TokenStream stemmed = new PersianStemFilter(tokenStream);
        return stemmed;
    }
}
Expand Up @@ -23,6 +23,7 @@
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.es.SpanishLightStemFilter;
import org.apache.lucene.analysis.fa.PersianStemFilter;
import org.apache.lucene.analysis.fi.FinnishLightStemFilter;
import org.apache.lucene.analysis.fr.FrenchLightStemFilter;
import org.apache.lucene.analysis.fr.FrenchMinimalStemFilter;
Expand Down Expand Up @@ -213,6 +214,10 @@ public TokenStream create(TokenStream tokenStream) {
} else if ("minimal_nynorsk".equalsIgnoreCase(language) || "minimalNynorsk".equalsIgnoreCase(language)) {
return new NorwegianMinimalStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK);

// Persian stemmers
} else if ("persian".equalsIgnoreCase(language)) {
return new PersianStemFilter(tokenStream);

// Portuguese stemmers
} else if ("portuguese".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new PortugueseStemmer());
Expand Down
Expand Up @@ -117,6 +117,7 @@ protected Map<String, Class<?>> getTokenFilters() {
filters.put("hindinormalization", HindiNormalizationFilterFactory.class);
filters.put("indicnormalization", IndicNormalizationFilterFactory.class);
filters.put("persiannormalization", PersianNormalizationFilterFactory.class);
filters.put("persianstem", PersianStemTokenFilterFactory.class);
filters.put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class);
filters.put("serbiannormalization", SerbianNormalizationFilterFactory.class);
filters.put("soraninormalization", SoraniNormalizationFilterFactory.class);
Expand Down

0 comments on commit 3a38385

Please sign in to comment.