src/main/java/org/elasticsearch/search/suggest/completion/Completion090PostingsFormat.java

/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.search.suggest.completion;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMap.Builder;
import org.apache.lucene.codecs.*;
import org.apache.lucene.index.*;
import org.apache.lucene.index.FilterAtomicReader.FilterTerms;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.IOContext.Context;
import org.apache.lucene.store.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.elasticsearch.ElasticSearchIllegalStateException;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.suggest.completion.CompletionTokenStream.ToFiniteStrings;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;

/**
 * This {@link PostingsFormat} is basically a T-Sink for a default postings
 * format that is used to store postings on disk fitting the lucene APIs and
 * builds a suggest FST as an auxiliary data structure next to the actual
 * postings format. It uses the delegate postings format for simplicity to
 * handle all the merge operations. The auxiliary suggest FST data structure is
 * only loaded if a FieldsProducer is requested for reading, for merging it uses
 * the low memory delegate postings format.
 */
public class Completion090PostingsFormat extends PostingsFormat {

    public static final String CODEC_NAME = "completion090";
    public static final int SUGGEST_CODEC_VERSION = 1;
    public static final String EXTENSION = "cmp";

    private final static ESLogger logger = Loggers.getLogger(Completion090PostingsFormat.class);
    private PostingsFormat delegatePostingsFormat;
    private final static Map<String, CompletionLookupProvider> providers;
    private CompletionLookupProvider writeProvider;


    static {
        final CompletionLookupProvider provider = new AnalyzingCompletionLookupProvider(true, false, true, false);
        final Builder<String, CompletionLookupProvider> builder = ImmutableMap.builder();
        providers = builder.put(provider.getName(), provider).build();
    }

    public Completion090PostingsFormat(PostingsFormat delegatePostingsFormat, CompletionLookupProvider provider) {
        super(CODEC_NAME);
        this.delegatePostingsFormat = delegatePostingsFormat;
        this.writeProvider = provider;
        assert delegatePostingsFormat != null && writeProvider != null;
    }

    /*
     * Used only by core Lucene at read-time via Service Provider instantiation
     * do not use at Write-time in application code.
     */
    public Completion090PostingsFormat() {
        super(CODEC_NAME);
    }

    @Override
    public CompletionFieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
        if (delegatePostingsFormat == null) {
            throw new UnsupportedOperationException("Error - " + getClass().getName()
                    + " has been constructed without a choice of PostingsFormat");
        }
        assert writeProvider != null;
        return new CompletionFieldsConsumer(state);
    }

    @Override
    public CompletionFieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
        return new CompletionFieldsProducer(state);
    }

    private class CompletionFieldsConsumer extends FieldsConsumer {

        private FieldsConsumer delegatesFieldsConsumer;
        private FieldsConsumer suggestFieldsConsumer;

        public CompletionFieldsConsumer(SegmentWriteState state) throws IOException {
            this.delegatesFieldsConsumer = delegatePostingsFormat.fieldsConsumer(state);
            String suggestFSTFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, EXTENSION);
            IndexOutput output = null;
            boolean success = false;
            try {
                output = state.directory.createOutput(suggestFSTFile, state.context);
                CodecUtil.writeHeader(output, CODEC_NAME, SUGGEST_CODEC_VERSION);
                /*
                 * we write the delegate postings format name so we can load it
                 * without getting an instance in the ctor
                 */
                output.writeString(delegatePostingsFormat.getName());
                output.writeString(writeProvider.getName());
                this.suggestFieldsConsumer = writeProvider.consumer(output);
                success = true;
            } finally {
                if (!success) {
                    IOUtils.closeWhileHandlingException(output);
                }
            }
        }

        @Override
        public TermsConsumer addField(final FieldInfo field) throws IOException {
            final TermsConsumer delegateConsumer = delegatesFieldsConsumer.addField(field);
            final TermsConsumer suggestTermConsumer = suggestFieldsConsumer.addField(field);
            final GroupedPostingsConsumer groupedPostingsConsumer = new GroupedPostingsConsumer(delegateConsumer, suggestTermConsumer);

            return new TermsConsumer() {
                @Override
                public PostingsConsumer startTerm(BytesRef text) throws IOException {
                    groupedPostingsConsumer.startTerm(text);
                    return groupedPostingsConsumer;
                }

                @Override
                public Comparator<BytesRef> getComparator() throws IOException {
                    return delegateConsumer.getComparator();
                }

                @Override
                public void finishTerm(BytesRef text, TermStats stats) throws IOException {
                    suggestTermConsumer.finishTerm(text, stats);
                    delegateConsumer.finishTerm(text, stats);
                }

                @Override
                public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
                    suggestTermConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount);
                    delegateConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount);
                }
            };
        }

        @Override
        public void close() throws IOException {
            IOUtils.close(delegatesFieldsConsumer, suggestFieldsConsumer);
        }
    }

    private class GroupedPostingsConsumer extends PostingsConsumer {

        private TermsConsumer[] termsConsumers;
        private PostingsConsumer[] postingsConsumers;

        public GroupedPostingsConsumer(TermsConsumer... termsConsumersArgs) {
            termsConsumers = termsConsumersArgs;
            postingsConsumers = new PostingsConsumer[termsConsumersArgs.length];
        }

        @Override
        public void startDoc(int docID, int freq) throws IOException {
            for (PostingsConsumer postingsConsumer : postingsConsumers) {
                postingsConsumer.startDoc(docID, freq);
            }
        }

        @Override
        public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
            for (PostingsConsumer postingsConsumer : postingsConsumers) {
                postingsConsumer.addPosition(position, payload, startOffset, endOffset);
            }
        }

        @Override
        public void finishDoc() throws IOException {
            for (PostingsConsumer postingsConsumer : postingsConsumers) {
                postingsConsumer.finishDoc();
            }
        }

        public void startTerm(BytesRef text) throws IOException {
            for (int i = 0; i < termsConsumers.length; i++) {
                postingsConsumers[i] = termsConsumers[i].startTerm(text);
            }
        }
    }

    private static class CompletionFieldsProducer extends FieldsProducer {

        private FieldsProducer delegateProducer;
        private LookupFactory lookupFactory;

        public CompletionFieldsProducer(SegmentReadState state) throws IOException {
            String suggestFSTFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, EXTENSION);
            IndexInput input = state.directory.openInput(suggestFSTFile, state.context);
            CodecUtil.checkHeader(input, CODEC_NAME, SUGGEST_CODEC_VERSION, SUGGEST_CODEC_VERSION);
            boolean success = false;
            try {
                PostingsFormat delegatePostingsFormat = PostingsFormat.forName(input.readString());
                String providerName = input.readString();
                CompletionLookupProvider completionLookupProvider = providers.get(providerName);
                if (completionLookupProvider == null) {
                    throw new ElasticSearchIllegalStateException("no provider with name [" + providerName + "] registered");
                }
                // TODO: we could clone the ReadState and make it always forward IOContext.MERGE to prevent unecessary heap usage? 
                this.delegateProducer = delegatePostingsFormat.fieldsProducer(state);
                /*
                 * If we are merging we don't load the FSTs at all such that we
                 * don't consume so much memory during merge
                 */
                if (state.context.context != Context.MERGE) {
                    // TODO: maybe we can do this in a fully lazy fashion based on some configuration
                    // eventually we should have some kind of curciut breaker that prevents us from going OOM here
                    // with some configuration
                    this.lookupFactory = completionLookupProvider.load(input);
                }
                success = true;
            } finally {
                if (!success) {
                    IOUtils.closeWhileHandlingException(delegateProducer, input);
                } else {
                    IOUtils.close(input);
                }
            }
        }

        @Override
        public void close() throws IOException {
            IOUtils.close(delegateProducer);
        }

        @Override
        public Iterator<String> iterator() {
            return delegateProducer.iterator();
        }

        @Override
        public Terms terms(String field) throws IOException {
            Terms terms = delegateProducer.terms(field);
            if (terms == null) {
                return terms;
            }
            return new CompletionTerms(terms, this.lookupFactory);
        }

        @Override
        public int size() {
            return delegateProducer.size();
        }

        @Override
        public long ramBytesUsed() {
            return RamUsageEstimator.sizeOf(lookupFactory) + delegateProducer.ramBytesUsed();
        }
    }

    public static final class CompletionTerms extends FilterTerms {
        private final LookupFactory lookup;

        public CompletionTerms(Terms delegate, LookupFactory lookup) {
            super(delegate);
            this.lookup = lookup;
        }

        public Lookup getLookup(FieldMapper<?> mapper, CompletionSuggestionContext suggestionContext) {
            return lookup.getLookup(mapper, suggestionContext);
        }

        public CompletionStats stats(String ... fields) {
            return lookup.stats(fields);
        }
    }

    public static abstract class CompletionLookupProvider implements PayloadProcessor, ToFiniteStrings {

        public static final char UNIT_SEPARATOR = '\u001f';

        public abstract FieldsConsumer consumer(IndexOutput output) throws IOException;

        public abstract String getName();

        public abstract LookupFactory load(IndexInput input) throws IOException;

        @Override
        public BytesRef buildPayload(BytesRef surfaceForm, long weight, BytesRef payload) throws IOException {
            if (weight < -1 || weight > Integer.MAX_VALUE) {
                throw new IllegalArgumentException("weight must be >= -1 && <= Integer.MAX_VALUE");
            }
            for (int i = 0; i < surfaceForm.length; i++) {
                if (surfaceForm.bytes[i] == UNIT_SEPARATOR) {
                    throw new IllegalArgumentException(
                            "surface form cannot contain unit separator character U+001F; this character is reserved");
                }
            }
            ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
            OutputStreamDataOutput output = new OutputStreamDataOutput(byteArrayOutputStream);
            output.writeVLong(weight + 1);
            output.writeVInt(surfaceForm.length);
            output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
            output.writeVInt(payload.length);
            output.writeBytes(payload.bytes, 0, payload.length);

            output.close();
            return new BytesRef(byteArrayOutputStream.toByteArray());
        }

        @Override
        public void parsePayload(BytesRef payload, SuggestPayload ref) throws IOException {
            ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(payload.bytes, payload.offset, payload.length);
            InputStreamDataInput input = new InputStreamDataInput(byteArrayInputStream);
            ref.weight = input.readVLong() - 1;
            int len = input.readVInt();
            ref.surfaceForm.grow(len);
            ref.surfaceForm.length = len;
            input.readBytes(ref.surfaceForm.bytes, ref.surfaceForm.offset, ref.surfaceForm.length);
            len = input.readVInt();
            ref.payload.grow(len);
            ref.payload.length = len;
            input.readBytes(ref.payload.bytes, ref.payload.offset, ref.payload.length);
            input.close();
        }
    }

    public CompletionStats completionStats(IndexReader indexReader, String ... fields) {
        CompletionStats completionStats = new CompletionStats();
        for (AtomicReaderContext atomicReaderContext : indexReader.leaves()) {
            AtomicReader atomicReader = atomicReaderContext.reader();
            try {
                for (String fieldName : atomicReader.fields()) {
                    Terms terms = atomicReader.fields().terms(fieldName);
                    if (terms instanceof CompletionTerms) {
                        CompletionTerms completionTerms = (CompletionTerms) terms;
                        completionStats.add(completionTerms.stats(fields));
                    }
                }
            } catch (IOException e) {
                logger.error("Could not get completion stats: {}", e, e.getMessage());
            }
        }

        return completionStats;
    }

    public static abstract class LookupFactory {
        public abstract Lookup getLookup(FieldMapper<?> mapper, CompletionSuggestionContext suggestionContext);
        public abstract CompletionStats stats(String ... fields);
        abstract AnalyzingCompletionLookupProvider.AnalyzingSuggestHolder getAnalyzingSuggestHolder(FieldMapper<?> mapper);
    }
}