Skip to content

Commit

Permalink
Ukrainian language plugin can fill up heap (#71998)
Browse files Browse the repository at this point in the history
The Lucene Ukrainian analyzer has a bug where a large in-memory
dictionary is loaded and stored on a thread local for every tokenstream
generated in a new thread (for more details see
https://issues.apache.org/jira/browse/LUCENE-9930). Due to checks
added in #50908, we create a tokenstream for every registered
analyzer in every shard, which means that any node with the Ukrainian
plugin installed will leak one copy of this dictionary per shard,
whether or not the Ukrainian analyzer is actually being used.

This commit makes the plugin use a fixed version of the
UkrainianMorfologikAnalyzer, until we merge a version of Lucene that
contains the upstream fix.
  • Loading branch information
romseygeek committed Apr 21, 2021
1 parent 81017be commit d6038a3
Show file tree
Hide file tree
Showing 3 changed files with 165 additions and 7 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
/*@notice
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.uk;

import morfologik.stemming.Dictionary;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.morfologik.MorfologikFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.SuppressForbidden;

import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;

/**
* A dictionary-based {@link Analyzer} for Ukrainian.
*
* Modified from lucene 8.8.0 sources to incorporate a bugfix for
* https://issues.apache.org/jira/browse/LUCENE-9930
*/
public final class XUkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {

    /** File containing default Ukrainian stopwords. */
    public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";

    /** Classpath location of the Morfologik dictionary handed to {@link MorfologikFilter}. */
    private static final String DICTIONARY_RESOURCE = "ua/net/nlp/ukrainian.dict";

    /**
     * Character normalization rules applied by {@link #initReader(String, Reader)}.
     * The rules never change, so the map is built once and reused by every
     * {@link MappingCharFilter} instead of being rebuilt for each reader.
     */
    private static final NormalizeCharMap NORMALIZER_MAP = buildNormalizerMap();

    /** Terms that are marked as keywords before the {@link MorfologikFilter} runs. */
    private final CharArraySet stemExclusionSet;

    /**
     * Returns the shared default stop words set loaded from {@link #DEFAULT_STOPWORD_FILE}.
     *
     * @return default stop words set.
     */
    public static CharArraySet getDefaultStopSet() {
        return DefaultSetHolder.DEFAULT_STOP_SET;
    }

    /**
     * Loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion, once, when the outer class
     * first touches one of the static final fields of this holder. Keeping the dictionary in
     * a static field means a single instance is shared by every token stream.
     */
    @SuppressForbidden(reason="Lucene uses IOUtils")
    private static class DefaultSetHolder {
        static final CharArraySet DEFAULT_STOP_SET;
        static final Dictionary DICTIONARY;

        static {
            try {
                // NOTE(review): both resources are resolved through the upstream analyzer class,
                // presumably because they ship alongside it - confirm before changing the anchor.
                DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(UkrainianMorfologikAnalyzer.class,
                    DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));

                java.net.URL dictionaryUrl = UkrainianMorfologikAnalyzer.class.getClassLoader().getResource(DICTIONARY_RESOURCE);
                if (dictionaryUrl == null) {
                    // getResource signals a missing resource with null; fail with the resource
                    // name instead of letting the null reach Dictionary.read
                    throw new IllegalStateException("Unable to load resources: [" + DICTIONARY_RESOURCE + "] not found");
                }
                DICTIONARY = Dictionary.read(dictionaryUrl);
            } catch (IOException ex) {
                // default set should always be present as it is part of the
                // distribution (JAR)
                throw new RuntimeException("Unable to load resources", ex);
            }
        }
    }

    /**
     * Builds the immutable normalization map: several apostrophe variants are folded
     * into the ASCII apostrophe, two characters are dropped, and two letters are
     * replaced by their counterparts.
     */
    private static NormalizeCharMap buildNormalizerMap() {
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        // different apostrophes
        builder.add("\u2019", "'");
        builder.add("\u2018", "'");
        builder.add("\u02BC", "'");
        builder.add("`", "'");
        builder.add("´", "'");
        // ignored characters
        builder.add("\u0301", "");
        builder.add("\u00AD", "");
        builder.add("ґ", "г");
        builder.add("Ґ", "Г");
        return builder.build();
    }

    /**
     * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
     */
    public XUkrainianMorfologikAnalyzer() {
        this(DefaultSetHolder.DEFAULT_STOP_SET);
    }

    /**
     * Builds an analyzer with the given stop words and an empty stem exclusion set.
     *
     * @param stopwords a stopword set
     */
    public XUkrainianMorfologikAnalyzer(CharArraySet stopwords) {
        this(stopwords, CharArraySet.EMPTY_SET);
    }

    /**
     * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
     * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
     * stemming.
     *
     * @param stopwords a stopword set
     * @param stemExclusionSet a set of terms not to be stemmed; it is copied, so later
     *                         changes to the argument do not affect this analyzer
     */
    public XUkrainianMorfologikAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
        super(stopwords);
        this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
    }

    /**
     * Wraps the incoming reader in a {@link MappingCharFilter} that applies the shared
     * normalization map before tokenization.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader the raw character source
     * @return the reader with character normalization applied
     */
    @Override
    protected Reader initReader(String fieldName, Reader reader) {
        return new MappingCharFilter(NORMALIZER_MAP, reader);
    }

    /**
     * Creates a
     * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
     * which tokenizes all the text in the provided {@link Reader}.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @return A
     *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
     *         built from a {@link StandardTokenizer} filtered with
     *         {@link LowerCaseFilter}, {@link StopFilter},
     *         {@link SetKeywordMarkerFilter} if a stem exclusion set is
     *         provided and {@link MorfologikFilter} on the Ukrainian dictionary.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new StandardTokenizer();
        TokenStream result = new LowerCaseFilter(source);
        result = new StopFilter(result, stopwords);

        if (stemExclusionSet.isEmpty() == false) {
            result = new SetKeywordMarkerFilter(result, stemExclusionSet);
        }

        // the dictionary comes from the static holder, so every stream shares one instance
        result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY);
        return new TokenStreamComponents(source, result);
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,27 @@

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer;
import org.apache.lucene.analysis.uk.XUkrainianMorfologikAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

public class UkrainianAnalyzerProvider extends AbstractIndexAnalyzerProvider<UkrainianMorfologikAnalyzer> {
public class UkrainianAnalyzerProvider extends AbstractIndexAnalyzerProvider<XUkrainianMorfologikAnalyzer> {

private final UkrainianMorfologikAnalyzer analyzer;
private final XUkrainianMorfologikAnalyzer analyzer;

public UkrainianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new UkrainianMorfologikAnalyzer(
analyzer = new XUkrainianMorfologikAnalyzer(
Analysis.parseStopWords(env, settings, UkrainianMorfologikAnalyzer.getDefaultStopSet()),
Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
);
analyzer.setVersion(version);
}

@Override
public UkrainianMorfologikAnalyzer get() {
public XUkrainianMorfologikAnalyzer get() {
return this.analyzer;
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer;
import org.apache.lucene.analysis.uk.XUkrainianMorfologikAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.plugin.analysis.ukrainian.AnalysisUkrainianPlugin;
Expand All @@ -27,6 +27,6 @@ public void testDefaultsUkranianAnalysis() throws IOException {
new AnalysisUkrainianPlugin());

Analyzer analyzer = analysis.indexAnalyzers.get("ukrainian").analyzer();
MatcherAssert.assertThat(analyzer, instanceOf(UkrainianMorfologikAnalyzer.class));
MatcherAssert.assertThat(analyzer, instanceOf(XUkrainianMorfologikAnalyzer.class));
}
}

0 comments on commit d6038a3

Please sign in to comment.