Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Updated Analyzing/Fuzzysuggester from lucene trunk
* Minor alignments (like setter to ctor)
* FuzzySuggester has a unicode aware flag, which is not exposed in the fuzzy completion request parameters
* Made XAnalyzingSuggester flags (PAYLOAD_SEP, END_BYTE, SEP_LABEL) to be written into the postings format, so we can retain backwards compatibility
* The above change also implies, that these flags can be set per instantiated XAnalyzingSuggester
* CompletionPostingsFormatTest now uses a randomProvider for writing data to check for bwc
  • Loading branch information
spinscale committed Nov 26, 2013
1 parent 9f5d01c commit bf74f49
Show file tree
Hide file tree
Showing 11 changed files with 605 additions and 80 deletions.
5 changes: 5 additions & 0 deletions docs/reference/search/suggesters/completion-suggest.asciidoc
Expand Up @@ -218,6 +218,11 @@ The following parameters are supported:
Minimum length of the input, which is not
checked for fuzzy alternatives, defaults to `1`

`unicode_aware`::
Sets all are measurements (like edit distance,
transpositions and lengths) in unicode code points
(actual letters) instead of bytes.

NOTE: If you want to stick with the default values, but
still use fuzzy, you can either use `fuzzy: {}`
or `fuzzy: true`.

Large diffs are not rendered by default.

Expand Up @@ -19,6 +19,7 @@
package org.apache.lucene.search.suggest.analyzing;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.*;
Expand Down Expand Up @@ -48,6 +49,9 @@
* #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be
* edited. We allow up to 1 (@link
* #DEFAULT_MAX_EDITS} edit.
* If {@link #unicodeAware} parameter in the constructor is set to true, maxEdits,
* minFuzzyLength, transpositions and nonFuzzyPrefix are measured in Unicode code
* points (actual letters) instead of bytes.*
*
* <p>
* NOTE: This suggester does not boost suggestions that
Expand All @@ -60,12 +64,22 @@
* like synonyms to keep the complexity of the prefix intersection low for good
* lookup performance. At index time, complex analyzers can safely be used.
* </p>
*
* @lucene.experimental
*/
public final class XFuzzySuggester extends XAnalyzingSuggester {
private final int maxEdits;
private final boolean transpositions;
private final int nonFuzzyPrefix;
private final int minFuzzyLength;
private final boolean unicodeAware;

/**
* Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix
* parameters in Unicode code points (actual letters)
* instead of bytes.
*/
public static final boolean DEFAULT_UNICODE_AWARE = false;

/**
* The default minimum length of the key passed to {@link
Expand Down Expand Up @@ -108,7 +122,7 @@ public XFuzzySuggester(Analyzer analyzer) {
*/
public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,
DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, null, false, 0);
DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE);

}

Expand All @@ -133,11 +147,15 @@ public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
* Levenshtein algorithm.
* @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}
* @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
* @param sepLabel separation label
* @param payloadSep payload separator byte
* @param endByte end byte marker byte
*/
public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength,
FST<PairOutputs.Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput) {
super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, fst, hasPayloads, maxAnalyzedPathsForOneInput);
int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength, boolean unicodeAware,
FST<PairOutputs.Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
int sepLabel, int payloadSep, int endByte) {
super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, true, fst, hasPayloads, maxAnalyzedPathsForOneInput, sepLabel, payloadSep, endByte);
if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
}
Expand All @@ -152,6 +170,7 @@ public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int optio
this.transpositions = transpositions;
this.nonFuzzyPrefix = nonFuzzyPrefix;
this.minFuzzyLength = minFuzzyLength;
this.unicodeAware = unicodeAware;
}

@Override
Expand All @@ -170,7 +189,7 @@ protected List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> getFullPrefixPaths
// "compete") ... in which case I think the wFST needs
// to be log weights or something ...

Automaton levA = toLevenshteinAutomata(lookupAutomaton);
Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
w.write(levA.toDot());
Expand All @@ -180,6 +199,24 @@ protected List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> getFullPrefixPaths
return FSTUtil.intersectPrefixPaths(levA, fst);
}

@Override
protected Automaton convertAutomaton(Automaton a) {
if (unicodeAware) {
Automaton utf8automaton = new UTF32ToUTF8().convert(a);
BasicOperations.determinize(utf8automaton);
return utf8automaton;
} else {
return a;
}
}

@Override
public TokenStreamToAutomaton getTokenStreamToAutomaton() {
final TokenStreamToAutomaton tsta = super.getTokenStreamToAutomaton();
tsta.setUnicodeArcs(unicodeAware);
return tsta;
}

Automaton toLevenshteinAutomata(Automaton automaton) {
final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
Automaton subs[] = new Automaton[ref.size()];
Expand All @@ -197,7 +234,7 @@ Automaton toLevenshteinAutomata(Automaton automaton) {
// to allow the trailing dedup bytes to be
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
Automaton levAutomaton = lev.toAutomaton(maxEdits);
Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
Expand Down
Expand Up @@ -55,7 +55,8 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
private static final int MAX_GRAPH_EXPANSIONS = -1;

public static final String CODEC_NAME = "analyzing";
public static final int CODEC_VERSION = 1;
public static final int CODEC_VERSION_START = 1;
public static final int CODEC_VERSION_LATEST = 2;

private boolean preserveSep;
private boolean preservePositionIncrements;
Expand All @@ -73,8 +74,7 @@ public AnalyzingCompletionLookupProvider(boolean preserveSep, boolean exactFirst
int options = preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0;
// needs to fixed in the suggester first before it can be supported
//options |= exactFirst ? XAnalyzingSuggester.EXACT_FIRST : 0;
prototype = new XAnalyzingSuggester(null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, null, false, 1);
prototype.setPreservePositionIncrements(preservePositionIncrements);
prototype = new XAnalyzingSuggester(null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE);
}

@Override
Expand All @@ -84,7 +84,7 @@ public String getName() {

@Override
public FieldsConsumer consumer(final IndexOutput output) throws IOException {
CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION);
CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
return new FieldsConsumer() {
private Map<FieldInfo, Long> fieldOffsets = new HashMap<FieldInfo, Long>();

Expand All @@ -111,7 +111,7 @@ public void close() throws IOException {
public TermsConsumer addField(final FieldInfo field) throws IOException {

return new TermsConsumer() {
final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(maxSurfaceFormsPerAnalyzedForm, hasPayloads);
final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
final CompletionPostingsConsumer postingsConsumer = new CompletionPostingsConsumer(AnalyzingCompletionLookupProvider.this, builder);

@Override
Expand Down Expand Up @@ -156,6 +156,9 @@ public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws
options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
output.writeVInt(options);
output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
output.writeVInt(XAnalyzingSuggester.END_BYTE);
output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
}
}
};
Expand Down Expand Up @@ -200,7 +203,7 @@ public int getMaxAnalyzedPathsForOneInput() {

@Override
public LookupFactory load(IndexInput input) throws IOException {
CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION, CODEC_VERSION);
int version = CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION_START, CODEC_VERSION_LATEST);
final Map<String, AnalyzingSuggestHolder> lookupMap = new HashMap<String, AnalyzingSuggestHolder>();
input.seek(input.length() - 8);
long metaPointer = input.readLong();
Expand All @@ -225,8 +228,23 @@ public LookupFactory load(IndexInput input) throws IOException {
boolean preserveSep = (options & SERIALIZE_PRESERVE_SEPERATORS) != 0;
boolean hasPayloads = (options & SERIALIZE_HAS_PAYLOADS) != 0;
boolean preservePositionIncrements = (options & SERIALIZE_PRESERVE_POSITION_INCREMENTS) != 0;
lookupMap.put(entry.getValue(), new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
hasPayloads, maxAnalyzedPathsForOneInput, fst));

// first version did not include these three fields, so fall back to old default (before the analyzingsuggester
// was updated in Lucene, so we cannot use the suggester defaults)
int sepLabel, payloadSep, endByte;
if (version == CODEC_VERSION_START) {
sepLabel = 0xFF;
payloadSep = '\u001f';
endByte = 0x0;
} else {
sepLabel = input.readVInt();
endByte = input.readVInt();
payloadSep = input.readVInt();
}

AnalyzingSuggestHolder holder = new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
hasPayloads, maxAnalyzedPathsForOneInput, fst, sepLabel, payloadSep, endByte);
lookupMap.put(entry.getValue(), holder);
}
return new LookupFactory() {
@Override
Expand All @@ -242,17 +260,16 @@ public Lookup getLookup(FieldMapper<?> mapper, CompletionSuggestionContext sugge
suggester = new XFuzzySuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
suggestionContext.getFuzzyEditDistance(), suggestionContext.isFuzzyTranspositions(),
suggestionContext.getFuzzyPrefixLength(), suggestionContext.getFuzzyMinLength(),
suggestionContext.getFuzzyPrefixLength(), suggestionContext.getFuzzyMinLength(), suggestionContext.isFuzzyUnicodeAware(),
analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput);
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte);

} else {
suggester = new XAnalyzingSuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput);
analyzingSuggestHolder.preservePositionIncrements, analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte);
}
suggester.setPreservePositionIncrements(analyzingSuggestHolder.preservePositionIncrements);
return suggester;
}

Expand Down Expand Up @@ -280,6 +297,11 @@ public CompletionStats stats(String... fields) {

return new CompletionStats(sizeInBytes, completionFields);
}

@Override
AnalyzingSuggestHolder getAnalyzingSuggestHolder(FieldMapper<?> mapper) {
return lookupMap.get(mapper.names().indexName());
}
};
}

Expand All @@ -291,18 +313,27 @@ static class AnalyzingSuggestHolder {
final boolean hasPayloads;
final int maxAnalyzedPathsForOneInput;
final FST<Pair<Long, BytesRef>> fst;
final int sepLabel;
final int payloadSep;
final int endByte;

public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
boolean hasPayloads, int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst) {
this(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, hasPayloads, maxAnalyzedPathsForOneInput, fst, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE);
}

public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, boolean hasPayloads, int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst, int sepLabel, int payloadSep, int endByte) {
this.preserveSep = preserveSep;
this.preservePositionIncrements = preservePositionIncrements;
this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
this.maxGraphExpansions = maxGraphExpansions;
this.hasPayloads = hasPayloads;
this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
this.fst = fst;
this.sepLabel = sepLabel;
this.payloadSep = payloadSep;
this.endByte = endByte;
}

}

@Override
Expand Down
Expand Up @@ -362,5 +362,6 @@ public CompletionStats completionStats(IndexReader indexReader, String ... field
public static abstract class LookupFactory {
public abstract Lookup getLookup(FieldMapper<?> mapper, CompletionSuggestionContext suggestionContext);
public abstract CompletionStats stats(String ... fields);
abstract AnalyzingCompletionLookupProvider.AnalyzingSuggestHolder getAnalyzingSuggestHolder(FieldMapper<?> mapper);
}
}
Expand Up @@ -68,6 +68,8 @@ public SuggestionSearchContext.SuggestionContext parse(XContentParser parser, Ma
suggestion.setFuzzyMinLength(parser.intValue());
} else if ("prefix_length".equals(fuzzyConfigName) || "prefixLength".equals(fuzzyConfigName)) {
suggestion.setFuzzyPrefixLength(parser.intValue());
} else if ("unicode_aware".equals(fuzzyConfigName) || "unicodeAware".equals(fuzzyConfigName)) {
suggestion.setFuzzyUnicodeAware(parser.booleanValue());
}
}
}
Expand Down
Expand Up @@ -34,6 +34,7 @@ public class CompletionSuggestionContext extends SuggestionSearchContext.Suggest
private int fuzzyMinLength = XFuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH;
private int fuzzyPrefixLength = XFuzzySuggester.DEFAULT_NON_FUZZY_PREFIX;
private boolean fuzzy = false;
private boolean fuzzyUnicodeAware = XFuzzySuggester.DEFAULT_UNICODE_AWARE;

public CompletionSuggestionContext(Suggester suggester) {
super(suggester);
Expand Down Expand Up @@ -86,4 +87,12 @@ public void setFuzzy(boolean fuzzy) {
public boolean isFuzzy() {
return fuzzy;
}

public void setFuzzyUnicodeAware(boolean fuzzyUnicodeAware) {
this.fuzzyUnicodeAware = fuzzyUnicodeAware;
}

public boolean isFuzzyUnicodeAware() {
return fuzzyUnicodeAware;
}
}
Expand Up @@ -38,6 +38,7 @@ public CompletionSuggestionFuzzyBuilder(String name) {
private boolean fuzzyTranspositions = XFuzzySuggester.DEFAULT_TRANSPOSITIONS;
private int fuzzyMinLength = XFuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH;
private int fuzzyPrefixLength = XFuzzySuggester.DEFAULT_NON_FUZZY_PREFIX;
private boolean unicodeAware = XFuzzySuggester.DEFAULT_UNICODE_AWARE;

public int getFuzzyEditDistance() {
return fuzzyEditDistance;
Expand Down Expand Up @@ -75,6 +76,15 @@ public CompletionSuggestionFuzzyBuilder setFuzzyPrefixLength(int fuzzyPrefixLeng
return this;
}

public boolean isUnicodeAware() {
return unicodeAware;
}

public CompletionSuggestionFuzzyBuilder setUnicodeAware(boolean unicodeAware) {
this.unicodeAware = unicodeAware;
return this;
}

@Override
protected XContentBuilder innerToXContent(XContentBuilder builder, ToXContent.Params params) throws IOException {
builder.startObject("fuzzy");
Expand All @@ -91,6 +101,9 @@ protected XContentBuilder innerToXContent(XContentBuilder builder, ToXContent.Pa
if (fuzzyPrefixLength != XFuzzySuggester.DEFAULT_NON_FUZZY_PREFIX) {
builder.field("prefix_length", fuzzyPrefixLength);
}
if (unicodeAware != XFuzzySuggester.DEFAULT_UNICODE_AWARE) {
builder.field("unicode_aware", unicodeAware);
}

builder.endObject();
return builder;
Expand Down

0 comments on commit bf74f49

Please sign in to comment.