From 310af09b6a83d4dba816e0b13af2ef2ecf6447f8 Mon Sep 17 00:00:00 2001 From: Reza Torabi Date: Tue, 5 Sep 2023 14:43:27 +0330 Subject: [PATCH] Add persian language stemmer (#99106) --- docs/changelog/99106.yaml | 6 ++++ .../tokenfilters/stemmer-tokenfilter.asciidoc | 3 ++ .../analysis/common/CommonAnalysisPlugin.java | 1 + .../common/PersianStemTokenFilterFactory.java | 28 +++++++++++++++++++ .../common/StemmerTokenFilterFactory.java | 5 ++++ .../common/CommonAnalysisFactoryTests.java | 1 + 6 files changed, 44 insertions(+) create mode 100644 docs/changelog/99106.yaml create mode 100644 modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PersianStemTokenFilterFactory.java diff --git a/docs/changelog/99106.yaml b/docs/changelog/99106.yaml new file mode 100644 index 0000000000000..21cb121595d2b --- /dev/null +++ b/docs/changelog/99106.yaml @@ -0,0 +1,6 @@ +pr: 99106 +summary: "Add support for Persian language stemmer" +area: Analysis +type: feature +issues: + - 98911 diff --git a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc index b88c5de0b8185..162164e12872d 100644 --- a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc @@ -214,6 +214,9 @@ Norwegian (Nynorsk):: {lucene-analysis-docs}/no/NorwegianLightStemmer.html[*`light_nynorsk`*], {lucene-analysis-docs}/no/NorwegianMinimalStemmer.html[`minimal_nynorsk`] +Persian:: +{lucene-analysis-docs}/fa/PersianStemmer.html[*`persian`*] + Portuguese:: https://dl.acm.org/citation.cfm?id=1141523&dl=ACM&coll=DL&CFID=179095584&CFTOKEN=80067181[*`light_portuguese`*], pass:macros[http://www.inf.ufrgs.br/~buriol/papers/Orengo_CLEF07.pdf[`minimal_portuguese`\]], diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java 
b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index c4f8915811aee..c6104e92b0b3e 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -322,6 +322,7 @@ public TokenStream create(TokenStream tokenStream) { filters.put("pattern_capture", requiresAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new)); filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new)); filters.put("persian_normalization", PersianNormalizationFilterFactory::new); + filters.put("persian_stem", PersianStemTokenFilterFactory::new); filters.put("porter_stem", PorterStemTokenFilterFactory::new); filters.put( "predicate_token_filter", diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PersianStemTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PersianStemTokenFilterFactory.java new file mode 100644 index 0000000000000..4fcf3fe896fbd --- /dev/null +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PersianStemTokenFilterFactory.java @@ -0,0 +1,28 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. 
+ */ + +package org.elasticsearch.analysis.common; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.fa.PersianStemFilter; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; + +public class PersianStemTokenFilterFactory extends AbstractTokenFilterFactory { + + PersianStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new PersianStemFilter(tokenStream); + } +} diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java index 4ef2f837368c9..8f9a882e29d2a 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.en.KStemFilter; import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.es.SpanishLightStemFilter; +import org.apache.lucene.analysis.fa.PersianStemFilter; import org.apache.lucene.analysis.fi.FinnishLightStemFilter; import org.apache.lucene.analysis.fr.FrenchLightStemFilter; import org.apache.lucene.analysis.fr.FrenchMinimalStemFilter; @@ -213,6 +214,10 @@ public TokenStream create(TokenStream tokenStream) { } else if ("minimal_nynorsk".equalsIgnoreCase(language) || "minimalNynorsk".equalsIgnoreCase(language)) { return new NorwegianMinimalStemFilter(tokenStream, NorwegianLightStemmer.NYNORSK); + // Persian stemmers + } else if ("persian".equalsIgnoreCase(language)) { + return new 
PersianStemFilter(tokenStream); + // Portuguese stemmers } else if ("portuguese".equalsIgnoreCase(language)) { return new SnowballFilter(tokenStream, new PortugueseStemmer()); diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index 777349ee81c93..f147cb47a2c01 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -117,6 +117,7 @@ protected Map<String, Class<?>> getTokenFilters() { filters.put("hindinormalization", HindiNormalizationFilterFactory.class); filters.put("indicnormalization", IndicNormalizationFilterFactory.class); filters.put("persiannormalization", PersianNormalizationFilterFactory.class); + filters.put("persianstem", PersianStemTokenFilterFactory.class); filters.put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class); filters.put("serbiannormalization", SerbianNormalizationFilterFactory.class); filters.put("soraninormalization", SoraniNormalizationFilterFactory.class);