Skip to content

Commit

Permalink
Fix beidermorse phonetic token filter for unspecified languageset (#…
Browse files Browse the repository at this point in the history
…27112)

Currently, when a BeiderMorseFilter is created without a `languageset` setting,
the filter does not guess the language, although guessing should be the default behaviour.
This change fixes that and adds simple tests for the cases with and without
a provided `languageset` setting.

Closes #26771
  • Loading branch information
cbuescher committed Oct 27, 2017
1 parent f972116 commit 3b37b79
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 9 deletions.
Expand Up @@ -19,9 +19,6 @@

package org.elasticsearch.index.analysis;

import java.util.HashSet;
import java.util.List;

import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.Caverphone1;
import org.apache.commons.codec.language.Caverphone2;
Expand All @@ -45,6 +42,9 @@
import org.elasticsearch.index.analysis.phonetic.KoelnerPhonetik;
import org.elasticsearch.index.analysis.phonetic.Nysiis;

import java.util.HashSet;
import java.util.List;

public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {

private final Encoder encoder;
Expand Down Expand Up @@ -116,11 +116,11 @@ public PhoneticTokenFilterFactory(IndexSettings indexSettings, Environment envir
public TokenStream create(TokenStream tokenStream) {
if (encoder == null) {
if (ruletype != null && nametype != null) {
if (languageset != null) {
final LanguageSet languages = LanguageSet.from(new HashSet<>(languageset));
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), languages);
LanguageSet langset = null;
if (languageset != null && languageset.size() > 0) {
langset = LanguageSet.from(new HashSet<>(languageset));
}
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true));
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), langset);
}
if (maxcodelength > 0) {
return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace);
Expand Down
Expand Up @@ -19,26 +19,57 @@

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.plugin.analysis.AnalysisPhoneticPlugin;
import org.elasticsearch.test.ESTestCase;
import org.hamcrest.MatcherAssert;
import org.junit.Before;

import java.io.IOException;
import java.io.StringReader;

import static org.hamcrest.Matchers.instanceOf;

public class SimplePhoneticAnalysisTests extends ESTestCase {
public void testPhoneticTokenFilterFactory() throws IOException {

private TestAnalysis analysis;

@Before
public void setup() throws IOException {
String yaml = "/org/elasticsearch/index/analysis/phonetic-1.yml";
Settings settings = Settings.builder().loadFromStream(yaml, getClass().getResourceAsStream(yaml), false)
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.build();
TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisPhoneticPlugin());
this.analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisPhoneticPlugin());
}

public void testPhoneticTokenFilterFactory() throws IOException {
TokenFilterFactory filterFactory = analysis.tokenFilter.get("phonetic");
MatcherAssert.assertThat(filterFactory, instanceOf(PhoneticTokenFilterFactory.class));
}

public void testPhoneticTokenFilterBeiderMorseNoLanguage() throws IOException {
TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilter");
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("ABADIAS"));
String[] expected = new String[] { "abYdias", "abYdios", "abadia", "abadiaS", "abadias", "abadio", "abadioS", "abadios", "abodia",
"abodiaS", "abodias", "abodio", "abodioS", "abodios", "avadias", "avadios", "avodias", "avodios", "obadia", "obadiaS",
"obadias", "obadio", "obadioS", "obadios", "obodia", "obodiaS", "obodias", "obodioS" };
BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}

public void testPhoneticTokenFilterBeiderMorseWithLanguage() throws IOException {
TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilterfrench");
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("Rimbault"));
String[] expected = new String[] { "rimbD", "rimbDlt", "rimba", "rimbalt", "rimbo", "rimbolt", "rimbu", "rimbult", "rmbD", "rmbDlt",
"rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" };
BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}
}
Expand Up @@ -19,6 +19,10 @@ index:
beidermorsefilter:
type: phonetic
encoder: beidermorse
beidermorsefilterfrench:
type: phonetic
encoder: beidermorse
languageset : [ "french" ]
koelnerphonetikfilter:
type: phonetic
encoder: koelnerphonetik
Expand Down

0 comments on commit 3b37b79

Please sign in to comment.