Synonym Graph Support (LUCENE-6664) (#21517)
Integrate the patch from LUCENE-6664 into elasticsearch and
add support for handling a graph token stream in match/multi-match
queries.

This fixes longstanding bugs with multi-token synonyms returning
incorrect results with proximity queries.
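
As a rough sketch of the query-side idea (not code from this commit; the helper class, field name, and SHOULD-combination below are illustrative assumptions), each finite path through the analysis graph can be turned into its own phrase query and the paths OR'd together, which is how a multi-token synonym such as "ny => new york" can match proximity queries correctly. The GraphTokenStreamFiniteStrings class added in the diff below does the path enumeration:

import java.io.IOException;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.GraphTokenStreamFiniteStrings;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;

class GraphPhraseQuerySketch {
  // Builds one PhraseQuery per finite path and combines them with SHOULD,
  // so a document matches if any expansion of the graph matches as a phrase.
  static Query phraseFromGraph(String field, TokenStream graph) throws IOException {
    List<TokenStream> paths = new GraphTokenStreamFiniteStrings().getTokenStreams(graph);
    BooleanQuery.Builder combined = new BooleanQuery.Builder();
    for (TokenStream path : paths) {
      TermToBytesRefAttribute termAtt = path.getAttribute(TermToBytesRefAttribute.class);
      PhraseQuery.Builder phrase = new PhraseQuery.Builder();
      int position = 0;
      path.reset();
      while (path.incrementToken()) {
        // Defensively copy the bytes; the attribute contract allows reuse.
        phrase.add(new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef())), position++);
      }
      combined.add(phrase.build(), BooleanClause.Occur.SHOULD);
    }
    return combined.build();
  }
}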
mattweber authored and Michael McCandless committed Nov 28, 2016
1 parent 8affb7c commit 04e07bc
Showing 17 changed files with 3,209 additions and 29 deletions.
@@ -0,0 +1,291 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.synonym;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
* Creates a list of {@link TokenStream} where each stream is the tokens that make up a finite string in the graph token stream. To do this,
* the graph token stream is converted to an {@link Automaton}, and from there a {@link FiniteStringsIterator} is used to collect a token
* stream for each finite string.
*/
public class GraphTokenStreamFiniteStrings {
private final Automaton.Builder builder;
Automaton det;
private final Map<BytesRef, Integer> termToID = new HashMap<>();
private final Map<Integer, BytesRef> idToTerm = new HashMap<>();
private int anyTermID = -1;

public GraphTokenStreamFiniteStrings() {
this.builder = new Automaton.Builder();
}

private static class BytesRefArrayTokenStream extends TokenStream {
private final BytesTermAttribute termAtt = addAttribute(BytesTermAttribute.class);
private final BytesRef[] terms;
private int offset;

BytesRefArrayTokenStream(BytesRef[] terms) {
this.terms = terms;
offset = 0;
}

@Override
public boolean incrementToken() throws IOException {
if (offset < terms.length) {
clearAttributes();
termAtt.setBytesRef(terms[offset]);
offset = offset + 1;
return true;
}

return false;
}
}

/**
* Returns a list of {@link TokenStream}, one per finite string of the given graph token stream.
*/
public List<TokenStream> getTokenStreams(final TokenStream in) throws IOException {
// build the automaton
build(in);

List<TokenStream> tokenStreams = new ArrayList<>();
final FiniteStringsIterator finiteStrings = new FiniteStringsIterator(det);
for (IntsRef string; (string = finiteStrings.next()) != null; ) {
final BytesRef[] tokens = new BytesRef[string.length];
for (int idx = string.offset, len = string.offset + string.length; idx < len; idx++) {
tokens[idx - string.offset] = idToTerm.get(string.ints[idx]);
}

tokenStreams.add(new BytesRefArrayTokenStream(tokens));
}

return tokenStreams;
}

private void build(final TokenStream in) throws IOException {
if (det != null) {
throw new IllegalStateException("Automation already built");
}

final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);

in.reset();

int pos = -1;
int lastPos = 0;
int maxOffset = 0;
int maxPos = -1;
int state = -1;
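// Map the token graph onto the automaton: each token position becomes a state,
// and each token becomes a transition from its start position to
// start + positionLength, so multi-position tokens span several states.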
while (in.incrementToken()) {
int posInc = posIncAtt.getPositionIncrement();
assert pos > -1 || posInc > 0;

if (posInc > 1) {
throw new IllegalArgumentException("cannot handle holes; to accept any term, use '*' term");
}

if (posInc > 0) {
// New node:
pos += posInc;
}

int endPos = pos + posLengthAtt.getPositionLength();
while (state < endPos) {
state = createState();
}

BytesRef term = termBytesAtt.getBytesRef();
//System.out.println(pos + "-" + endPos + ": " + term.utf8ToString() + ": posInc=" + posInc);
if (term.length == 1 && term.bytes[term.offset] == (byte) '*') {
addAnyTransition(pos, endPos);
} else {
addTransition(pos, endPos, term);
}

maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
maxPos = Math.max(maxPos, endPos);
}

in.end();

// TODO: look at endOffset? ts2a did...

// TODO: this (setting "last" state as the only accept state) may be too simplistic?
setAccept(state, true);
finish();
}

/**
* Returns a new state; state 0 is always the initial state.
*/
private int createState() {
return builder.createState();
}

/**
* Marks the specified state as accept or not.
*/
private void setAccept(int state, boolean accept) {
builder.setAccept(state, accept);
}

/**
* Adds a transition to the automaton.
*/
private void addTransition(int source, int dest, String term) {
addTransition(source, dest, new BytesRef(term));
}

/**
* Adds a transition to the automaton.
*/
private void addTransition(int source, int dest, BytesRef term) {
if (term == null) {
throw new NullPointerException("term should not be null");
}
builder.addTransition(source, dest, getTermID(term));
}

/**
* Adds a transition matching any term.
*/
private void addAnyTransition(int source, int dest) {
builder.addTransition(source, dest, getTermID(null));
}

/**
* Call this once you are done adding states/transitions.
*/
private void finish() {
finish(DEFAULT_MAX_DETERMINIZED_STATES);
}

/**
* Call this once you are done adding states/transitions.
*
* @param maxDeterminizedStates Maximum number of states created when determinizing the automaton. Higher values allow the operation
* to consume more memory but permit more complex automata.
*/
private void finish(int maxDeterminizedStates) {
Automaton automaton = builder.finish();

// System.out.println("before det:\n" + automaton.toDot());

Transition t = new Transition();

// TODO: should we add "eps back to initial node" for all states,
// and det that? then we don't need to revisit initial node at
// every position? but automaton could blow up? And, this makes it
// harder to skip useless positions at search time?

if (anyTermID != -1) {

// Make sure there are no leading or trailing ANY:
int count = automaton.initTransition(0, t);
for (int i = 0; i < count; i++) {
automaton.getNextTransition(t);
if (anyTermID >= t.min && anyTermID <= t.max) {
throw new IllegalStateException("automaton cannot lead with an ANY transition");
}
}

int numStates = automaton.getNumStates();
for (int i = 0; i < numStates; i++) {
count = automaton.initTransition(i, t);
for (int j = 0; j < count; j++) {
automaton.getNextTransition(t);
if (automaton.isAccept(t.dest) && anyTermID >= t.min && anyTermID <= t.max) {
throw new IllegalStateException("automaton cannot end with an ANY transition");
}
}
}

int termCount = termToID.size();
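// Term IDs are assigned densely from 0 (see getTermID), so an ANY
// transition can be rewritten below as the full label range [0, termCount-1].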

// We have to carefully translate these transitions so automaton
// realizes they also match all other terms:
Automaton newAutomaton = new Automaton();
for (int i = 0; i < numStates; i++) {
newAutomaton.createState();
newAutomaton.setAccept(i, automaton.isAccept(i));
}

for (int i = 0; i < numStates; i++) {
count = automaton.initTransition(i, t);
for (int j = 0; j < count; j++) {
automaton.getNextTransition(t);
int min, max;
if (t.min <= anyTermID && anyTermID <= t.max) {
// Match any term
min = 0;
max = termCount - 1;
} else {
min = t.min;
max = t.max;
}
newAutomaton.addTransition(t.source, t.dest, min, max);
}
}
newAutomaton.finishState();
automaton = newAutomaton;
}

det = Operations.removeDeadStates(Operations.determinize(automaton, maxDeterminizedStates));
}

private int getTermID(BytesRef term) {
Integer id = termToID.get(term);
if (id == null) {
id = termToID.size();
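// Deep-copy the term bytes: token streams reuse the attribute's BytesRef
// across calls to incrementToken().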
if (term != null) {
term = BytesRef.deepCopyOf(term);
}
termToID.put(term, id);
idToTerm.put(id, term);
if (term == null) {
anyTermID = id;
}
}

return id;
}
}
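
For reference, a minimal end-to-end sketch of driving the new class (not part of this commit; the analysis chain uses the SynonymGraphFilter from the LUCENE-6664 patch being integrated, and the demo class name and expected output are assumptions):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.GraphTokenStreamFiniteStrings;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

public class GraphFiniteStringsDemo {
  public static void main(String[] args) throws IOException {
    // "ny" also maps to the two-token synonym "new york", producing a graph.
    SynonymMap.Builder synonyms = new SynonymMap.Builder(true);
    CharsRefBuilder scratch = new CharsRefBuilder();
    synonyms.add(new CharsRef("ny"),
        SynonymMap.Builder.join(new String[] {"new", "york"}, scratch), true);

    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("ny city"));
    TokenStream graph = new SynonymGraphFilter(tokenizer, synonyms.build(), true);

    // One stream per finite path through the graph:
    for (TokenStream path : new GraphTokenStreamFiniteStrings().getTokenStreams(graph)) {
      TermToBytesRefAttribute term = path.getAttribute(TermToBytesRefAttribute.class);
      StringBuilder sb = new StringBuilder();
      path.reset();
      while (path.incrementToken()) {
        if (sb.length() > 0) sb.append(' ');
        sb.append(term.getBytesRef().utf8ToString());
      }
      System.out.println(sb);
    }
  }
}

Because the synonym is added with includeOrig=true, the graph keeps the original token, so the two printed paths should be "ny city" and "new york city".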
