Skip to content

Commit

Permalink
Add persian language stemmer (#99106)
Browse files Browse the repository at this point in the history
  • Loading branch information
rezatorabi committed Sep 5, 2023
1 parent 8ae4edd commit 310af09
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 0 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/99106.yaml
@@ -0,0 +1,6 @@
pr: 99106
summary: "Add support for Persian language stemmer"
area: Analysis
type: feature
issues:
- 98911
Expand Up @@ -214,6 +214,9 @@ Norwegian (Nynorsk)::
{lucene-analysis-docs}/no/NorwegianLightStemmer.html[*`light_nynorsk`*],
{lucene-analysis-docs}/no/NorwegianMinimalStemmer.html[`minimal_nynorsk`]
Persian::
{lucene-analysis-docs}/fa/PersianStemmer.html[*`persian`*]
Portuguese::
https://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[*`light_portuguese`*],
pass:macros[http://www.inf.ufrgs.br/~buriol/papers/Orengo_CLEF07.pdf[`minimal_portuguese`\]],
Expand Down
Expand Up @@ -322,6 +322,7 @@ public TokenStream create(TokenStream tokenStream) {
filters.put("pattern_capture", requiresAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new));
filters.put("persian_normalization", PersianNormalizationFilterFactory::new);
filters.put("persian_stem", PersianStemTokenFilterFactory::new);
filters.put("porter_stem", PorterStemTokenFilterFactory::new);
filters.put(
"predicate_token_filter",
Expand Down
@@ -0,0 +1,28 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fa.PersianStemFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;

/**
 * Token filter factory whose {@link #create(TokenStream)} wraps a token stream
 * in a {@link PersianStemFilter}.
 */
public class PersianStemTokenFilterFactory extends AbstractTokenFilterFactory {

    /**
     * Builds the factory, forwarding only the filter name and settings to the parent.
     *
     * @param indexSettings accepted but not read by this factory
     * @param environment   accepted but not read by this factory
     * @param name          passed through to the parent constructor
     * @param settings      passed through to the parent constructor
     */
    PersianStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(name, settings);
    }

    /**
     * Wraps the supplied stream in a new {@link PersianStemFilter}.
     *
     * @param tokenStream the upstream token stream to wrap
     * @return a new {@link PersianStemFilter} reading from {@code tokenStream}
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        final TokenStream stemmed = new PersianStemFilter(tokenStream);
        return stemmed;
    }
}
Expand Up @@ -23,6 +23,7 @@
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.es.SpanishLightStemFilter;
import org.apache.lucene.analysis.fa.PersianStemFilter;
import org.apache.lucene.analysis.fi.FinnishLightStemFilter;
import org.apache.lucene.analysis.fr.FrenchLightStemFilter;
import org.apache.lucene.analysis.fr.FrenchMinimalStemFilter;
Expand Down Expand Up @@ -213,6 +214,10 @@ public TokenStream create(TokenStream tokenStream) {
} else if ("minimal_nynorsk".equalsIgnoreCase(language) || "minimalNynorsk".equalsIgnoreCase(language)) {
return new NorwegianMinimalStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK);

// Persian stemmers
} else if ("persian".equalsIgnoreCase(language)) {
return new PersianStemFilter(tokenStream);

// Portuguese stemmers
} else if ("portuguese".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new PortugueseStemmer());
Expand Down
Expand Up @@ -117,6 +117,7 @@ protected Map<String, Class<?>> getTokenFilters() {
filters.put("hindinormalization", HindiNormalizationFilterFactory.class);
filters.put("indicnormalization", IndicNormalizationFilterFactory.class);
filters.put("persiannormalization", PersianNormalizationFilterFactory.class);
filters.put("persianstem", PersianStemTokenFilterFactory.class);
filters.put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class);
filters.put("serbiannormalization", SerbianNormalizationFilterFactory.class);
filters.put("soraninormalization", SoraniNormalizationFilterFactory.class);
Expand Down

0 comments on commit 310af09

Please sign in to comment.