Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Persian language stemmer #99106

Merged
Merged 1 commit on Sep 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/99106.yaml
@@ -0,0 +1,6 @@
pr: 99106
summary: "Add support for Persian language stemmer"
area: Analysis
type: feature
issues:
- 98911
Expand Up @@ -214,6 +214,9 @@ Norwegian (Nynorsk)::
{lucene-analysis-docs}/no/NorwegianLightStemmer.html[*`light_nynorsk`*],
{lucene-analysis-docs}/no/NorwegianMinimalStemmer.html[`minimal_nynorsk`]

Persian::
{lucene-analysis-docs}/fa/PersianStemmer.html[*`persian`*]

Portuguese::
https://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[*`light_portuguese`*],
pass:macros[http://www.inf.ufrgs.br/~buriol/papers/Orengo_CLEF07.pdf[`minimal_portuguese`\]],
Expand Down
Expand Up @@ -322,6 +322,7 @@ public TokenStream create(TokenStream tokenStream) {
filters.put("pattern_capture", requiresAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new));
filters.put("persian_normalization", PersianNormalizationFilterFactory::new);
filters.put("persian_stem", PersianStemTokenFilterFactory::new);
filters.put("porter_stem", PorterStemTokenFilterFactory::new);
filters.put(
"predicate_token_filter",
Expand Down
@@ -0,0 +1,28 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fa.PersianStemFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;

/**
 * Token filter factory that wraps an incoming {@link TokenStream} in Lucene's
 * {@link PersianStemFilter}.
 */
public class PersianStemTokenFilterFactory extends AbstractTokenFilterFactory {

    /**
     * Creates the factory.
     *
     * @param indexSettings index-level settings; not read by this factory
     * @param environment   node environment; not read by this factory
     * @param name          name of the filter, forwarded to the superclass
     * @param settings      filter settings, forwarded to the superclass
     */
    PersianStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(name, settings);
    }

    /**
     * Wraps the given stream in a new {@link PersianStemFilter}.
     *
     * @param tokenStream the upstream token stream to wrap
     * @return a new {@link PersianStemFilter} reading from {@code tokenStream}
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        final TokenStream stemmedStream = new PersianStemFilter(tokenStream);
        return stemmedStream;
    }
}
Expand Up @@ -23,6 +23,7 @@
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.es.SpanishLightStemFilter;
import org.apache.lucene.analysis.fa.PersianStemFilter;
import org.apache.lucene.analysis.fi.FinnishLightStemFilter;
import org.apache.lucene.analysis.fr.FrenchLightStemFilter;
import org.apache.lucene.analysis.fr.FrenchMinimalStemFilter;
Expand Down Expand Up @@ -213,6 +214,10 @@ public TokenStream create(TokenStream tokenStream) {
} else if ("minimal_nynorsk".equalsIgnoreCase(language) || "minimalNynorsk".equalsIgnoreCase(language)) {
return new NorwegianMinimalStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK);

// Persian stemmers
} else if ("persian".equalsIgnoreCase(language)) {
return new PersianStemFilter(tokenStream);

// Portuguese stemmers
} else if ("portuguese".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new PortugueseStemmer());
Expand Down
Expand Up @@ -117,6 +117,7 @@ protected Map<String, Class<?>> getTokenFilters() {
filters.put("hindinormalization", HindiNormalizationFilterFactory.class);
filters.put("indicnormalization", IndicNormalizationFilterFactory.class);
filters.put("persiannormalization", PersianNormalizationFilterFactory.class);
filters.put("persianstem", PersianStemTokenFilterFactory.class);
filters.put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class);
filters.put("serbiannormalization", SerbianNormalizationFilterFactory.class);
filters.put("soraninormalization", SoraniNormalizationFilterFactory.class);
Expand Down