From b55ad98d73b0d89763295e508710f28d240279a2 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 26 Jun 2014 08:18:59 -0400 Subject: [PATCH] Upgrade to Lucene 4.9 (closes #6623) --- core-signatures.txt | 8 - pom.xml | 2 +- .../apache/lucene/document/XStringField.java | 62 -- .../org/apache/lucene/index/XOrdinalMap.java | 306 ++++++ .../analyzing/XAnalyzingSuggester.java | 6 +- .../suggest/analyzing/XFuzzySuggester.java | 2 +- .../suggest/analyzing/XSpecialOperations.java | 200 ---- .../store/BufferedChecksumIndexOutput.java | 14 - .../lucene/store/BufferedIndexOutput.java | 155 +++ .../lucene/store/RateLimitedFSDirectory.java | 11 - .../lucene/store/XNativeFSLockFactory.java | 246 ----- src/main/java/org/elasticsearch/Version.java | 140 +-- .../compress/lzf/LZFCompressedIndexInput.java | 6 + .../elasticsearch/common/lucene/Lucene.java | 72 +- .../common/lucene/SegmentReaderUtils.java | 13 +- .../common/lucene/all/AllField.java | 10 +- .../common/lucene/docset/AllDocIdSet.java | 6 + .../common/lucene/docset/AndDocIdSet.java | 10 + .../common/lucene/docset/DocIdSets.java | 40 +- .../common/lucene/docset/NotDocIdSet.java | 6 + .../common/lucene/docset/OrDocIdSet.java | 10 + .../search/ApplyAcceptedDocsFilter.java | 11 + .../lucene/search/MoreLikeThisQuery.java | 19 +- .../common/lucene/search/XMoreLikeThis.java | 964 ------------------ .../common/util/AbstractArray.java | 2 +- .../common/util/AbstractBigArray.java | 2 +- .../elasticsearch/common/util/BigArray.java | 8 +- .../elasticsearch/common/util/BigArrays.java | 18 +- .../elasticsearch/env/NodeEnvironment.java | 4 +- .../index/analysis/NumericTokenizer.java | 1 + .../WordDelimiterTokenFilterFactory.java | 2 +- .../PerFieldMappingPostingFormatCodec.java | 4 +- .../DiskDocValuesFormatProvider.java | 5 +- .../docvaluesformat/DocValuesFormats.java | 5 +- .../DefaultPostingsFormatProvider.java | 2 +- .../PulsingPostingsFormatProvider.java | 2 +- .../index/fielddata/AtomicFieldData.java | 5 +- .../index/fielddata/IndexFieldDataCache.java | 19 +- .../index/fielddata/RamUsage.java | 30 - .../index/fielddata/ShardFieldData.java | 11 +- .../ordinals/GlobalOrdinalMapping.java | 27 +- .../GlobalOrdinalsIndexFieldData.java | 6 +- .../InternalGlobalOrdinalsBuilder.java | 11 +- .../InternalGlobalOrdinalsIndexFieldData.java | 12 +- .../fielddata/ordinals/MultiOrdinals.java | 3 +- .../index/fielddata/ordinals/Ordinals.java | 7 +- .../ordinals/SinglePackedOrdinals.java | 2 +- .../plain/AbstractGeoPointIndexFieldData.java | 2 +- .../plain/BinaryDVAtomicFieldData.java | 48 +- .../plain/BinaryDVNumericAtomicFieldData.java | 17 +- .../plain/BinaryDVNumericIndexFieldData.java | 3 +- .../plain/BytesBinaryDVAtomicFieldData.java | 9 +- .../plain/BytesBinaryDVIndexFieldData.java | 3 +- .../plain/DoubleArrayAtomicFieldData.java | 14 +- .../plain/DoubleArrayIndexFieldData.java | 8 +- .../plain/FSTBytesAtomicFieldData.java | 6 +- .../plain/FSTBytesIndexFieldData.java | 4 +- .../plain/FloatArrayAtomicFieldData.java | 14 +- .../plain/FloatArrayIndexFieldData.java | 8 +- .../GeoPointBinaryDVAtomicFieldData.java | 9 +- .../plain/GeoPointBinaryDVIndexFieldData.java | 5 +- .../GeoPointCompressedAtomicFieldData.java | 6 +- .../GeoPointCompressedIndexFieldData.java | 4 +- .../GeoPointDoubleArrayAtomicFieldData.java | 12 +- .../GeoPointDoubleArrayIndexFieldData.java | 4 +- .../fielddata/plain/IndexIndexFieldData.java | 2 +- .../plain/NumericDVAtomicFieldData.java | 18 +- .../plain/PackedArrayAtomicFieldData.java | 14 +- .../plain/PackedArrayIndexFieldData.java | 6 +- 
.../plain/PagedBytesAtomicFieldData.java | 4 +- .../plain/PagedBytesIndexFieldData.java | 11 +- .../plain/ParentChildAtomicFieldData.java | 4 +- .../plain/ParentChildIndexFieldData.java | 4 +- .../plain/SortedSetDVAtomicFieldData.java | 81 +- .../index/mapper/core/BooleanFieldMapper.java | 3 +- .../index/mapper/core/ByteFieldMapper.java | 2 +- .../mapper/core/CompletionFieldMapper.java | 8 +- .../index/mapper/core/DoubleFieldMapper.java | 2 +- .../index/mapper/core/FloatFieldMapper.java | 2 +- .../index/mapper/core/IntegerFieldMapper.java | 2 +- .../index/mapper/core/LongFieldMapper.java | 2 +- .../index/mapper/core/NumberFieldMapper.java | 2 +- .../index/mapper/core/ShortFieldMapper.java | 2 +- .../index/mapper/core/StringFieldMapper.java | 6 +- .../index/mapper/geo/GeoPointFieldMapper.java | 3 +- .../internal/FieldNamesFieldMapper.java | 3 +- .../index/mapper/internal/IdFieldMapper.java | 3 +- .../mapper/internal/IndexFieldMapper.java | 3 +- .../mapper/internal/ParentFieldMapper.java | 5 +- .../mapper/internal/RoutingFieldMapper.java | 3 +- .../mapper/internal/TypeFieldMapper.java | 3 +- .../index/mapper/internal/UidFieldMapper.java | 5 +- .../index/mapper/object/ObjectMapper.java | 6 +- .../policy/ElasticsearchMergePolicy.java | 32 +- .../LogByteSizeMergePolicyProvider.java | 8 - .../policy/TieredMergePolicyProvider.java | 8 - .../index/query/ScriptFilterParser.java | 5 - .../search/NumericRangeFieldDataFilter.java | 10 - .../index/search/geo/GeoDistanceFilter.java | 5 - .../search/geo/GeoDistanceRangeFilter.java | 5 - .../index/search/geo/GeoPolygonFilter.java | 5 - .../geo/InMemoryGeoBoundingBoxFilter.java | 10 - .../org/elasticsearch/index/store/Store.java | 26 - .../index/store/fs/FsDirectoryService.java | 3 +- .../index/translog/Translog.java | 8 +- .../index/translog/fs/FsTranslog.java | 2 +- .../analysis/PreBuiltTokenFilters.java | 2 +- .../cache/IndicesFieldDataCache.java | 36 +- .../cache/IndicesFieldDataCacheListener.java | 4 +- .../MultiDocumentPercolatorIndex.java | 4 +- .../SingleDocumentPercolatorIndex.java | 4 +- .../SourceScoreOrderFragmentsBuilder.java | 16 +- .../SourceSimpleFragmentsBuilder.java | 3 +- .../AnalyzingCompletionLookupProvider.java | 6 +- .../XPostingsHighlighterTests.java | 1 + .../fielddata/LongFieldDataBenchmark.java | 2 +- .../common/lucene/LuceneTest.java | 8 +- .../lucene/index/FreqTermsEnumTests.java | 2 +- .../common/lucene/uid/VersionsTests.java | 2 +- .../common/util/BigArraysTests.java | 8 +- .../elasticsearch/index/codec/CodecTests.java | 25 +- .../DefaultPostingsFormatTests.java | 4 +- .../fielddata/AbstractFieldDataImplTests.java | 10 +- .../AbstractStringFieldDataTests.java | 7 +- .../fielddata/ParentChildFieldDataTests.java | 2 +- .../mapper/all/SimpleAllMapperTests.java | 15 +- .../mapper/date/SimpleDateMappingTests.java | 10 +- .../mapper/numeric/SimpleNumericTests.java | 2 +- .../index/mapper/size/SizeMappingTests.java | 5 +- .../timestamp/TimestampMappingTests.java | 2 +- .../index/mapper/ttl/TTLMappingTests.java | 3 +- .../warmer/SimpleIndicesWarmerTests.java | 5 +- .../AnalyzingCompletionLookupProviderV1.java | 6 +- .../CompletionPostingsFormatTest.java | 5 +- .../test/cache/recycler/MockBigArrays.java | 4 +- 135 files changed, 1025 insertions(+), 2197 deletions(-) delete mode 100644 src/main/java/org/apache/lucene/document/XStringField.java create mode 100644 src/main/java/org/apache/lucene/index/XOrdinalMap.java delete mode 100644 src/main/java/org/apache/lucene/search/suggest/analyzing/XSpecialOperations.java create mode 100644 
src/main/java/org/apache/lucene/store/BufferedIndexOutput.java delete mode 100644 src/main/java/org/apache/lucene/store/XNativeFSLockFactory.java delete mode 100644 src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java delete mode 100644 src/main/java/org/elasticsearch/index/fielddata/RamUsage.java diff --git a/core-signatures.txt b/core-signatures.txt index a3fde46e8f238..7835b1fd04310 100644 --- a/core-signatures.txt +++ b/core-signatures.txt @@ -18,8 +18,6 @@ java.util.Collections#sort(java.util.List,java.util.Comparator) java.io.StringReader#(java.lang.String) @ Use FastStringReader instead -org.apache.lucene.util.RamUsageEstimator#sizeOf(java.lang.Object) @ This can be a perfromance trap - @defaultMessage Reference management is tricky, leave it to SearcherManager org.apache.lucene.index.IndexReader#decRef() org.apache.lucene.index.IndexReader#incRef() @@ -55,9 +53,3 @@ java.lang.Math#abs(long) @defaultMessage Use Long.compare instead we are on Java7 com.google.common.primitives.Longs#compare(long,long) - -@defaultMessage we have an optimized XStringField to reduce analysis creation overhead -org.apache.lucene.document.Field#(java.lang.String,java.lang.String,org.apache.lucene.document.FieldType) - -@defaultMessage Use XNativeFSLockFactory instead of the buggy NativeFSLockFactory see LUCENE-5738 - remove once Lucene 4.9 is released -org.apache.lucene.store.NativeFSLockFactory diff --git a/pom.xml b/pom.xml index d3ca5767e870a..863a40ee64239 100644 --- a/pom.xml +++ b/pom.xml @@ -31,7 +31,7 @@ - 4.8.1 + 4.9.0 auto true onerror diff --git a/src/main/java/org/apache/lucene/document/XStringField.java b/src/main/java/org/apache/lucene/document/XStringField.java deleted file mode 100644 index 7a562c7dfbe12..0000000000000 --- a/src/main/java/org/apache/lucene/document/XStringField.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.lucene.document; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.util.CloseableThreadLocal; - -import java.io.IOException; - -/** - * A string/text field that optimizes the case for non analyzed fields to reuse a thread local token - * stream (instead of creating it each time). This reduces analysis chain overhead and object creation - * (which is significant, yay Attributes). - *
<p/>
- * Not to be confused with Lucene StringField, this handles analyzed text as well, and relies on providing - * the FieldType. Couldn't come up with a good name for this that is different from Text/String... - */ -public class XStringField extends Field { - - private static final CloseableThreadLocal NOT_ANALYZED_TOKENSTREAM = new CloseableThreadLocal() { - @Override - protected StringTokenStream initialValue() { - return new StringTokenStream(); - } - }; - - public XStringField(String name, String value, FieldType fieldType) { - super(name, fieldType); - fieldsData = value; - } - - @Override - public TokenStream tokenStream(Analyzer analyzer) throws IOException { - if (!fieldType().indexed()) { - return null; - } - // Only use the cached TokenStream if the value is indexed and not-tokenized - if (fieldType().tokenized()) { - return super.tokenStream(analyzer); - } - StringTokenStream nonAnalyzedTokenStream = NOT_ANALYZED_TOKENSTREAM.get(); - nonAnalyzedTokenStream.setValue((String) fieldsData); - return nonAnalyzedTokenStream; - } -} diff --git a/src/main/java/org/apache/lucene/index/XOrdinalMap.java b/src/main/java/org/apache/lucene/index/XOrdinalMap.java new file mode 100644 index 0000000000000..0c5e68efc975a --- /dev/null +++ b/src/main/java/org/apache/lucene/index/XOrdinalMap.java @@ -0,0 +1,306 @@ +package org.apache.lucene.index; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.index.MultiTermsEnum.TermsEnumIndex; +import org.apache.lucene.index.MultiTermsEnum.TermsEnumWithSlice; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.InPlaceMergeSorter; +import org.apache.lucene.util.LongValues; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.packed.AppendingPackedLongBuffer; +import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer; +import org.apache.lucene.util.packed.PackedInts; + +/** maps per-segment ordinals to/from global ordinal space */ +// TODO: we could also have a utility method to merge Terms[] and use size() as a weight when we need it +// TODO: use more efficient packed ints structures? +// TODO: pull this out? its pretty generic (maps between N ord()-enabled TermsEnums) +public class XOrdinalMap implements Accountable { + +static { + assert org.elasticsearch.Version.CURRENT.luceneVersion == org.apache.lucene.util.Version.LUCENE_4_9: "Remove this code once we upgrade to Lucene 4.10 (LUCENE-5780, LUCENE-5782)"; +} + + private static class SegmentMap implements Accountable { + private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(SegmentMap.class); + + /** Build a map from an index into a sorted view of `weights` to an index into `weights`. 
*/ + private static int[] map(final long[] weights) { + final int[] newToOld = new int[weights.length]; + for (int i = 0; i < weights.length; ++i) { + newToOld[i] = i; + } + new InPlaceMergeSorter() { + @Override + protected void swap(int i, int j) { + final int tmp = newToOld[i]; + newToOld[i] = newToOld[j]; + newToOld[j] = tmp; + } + @Override + protected int compare(int i, int j) { + // j first since we actually want higher weights first + return Long.compare(weights[newToOld[j]], weights[newToOld[i]]); + } + }.sort(0, weights.length); + return newToOld; + } + + /** Inverse the map. */ + private static int[] inverse(int[] map) { + final int[] inverse = new int[map.length]; + for (int i = 0; i < map.length; ++i) { + inverse[map[i]] = i; + } + return inverse; + } + + private final int[] newToOld, oldToNew; + + SegmentMap(long[] weights) { + newToOld = map(weights); + oldToNew = inverse(newToOld); + assert Arrays.equals(newToOld, inverse(oldToNew)); + } + + int newToOld(int segment) { + return newToOld[segment]; + } + + int oldToNew(int segment) { + return oldToNew[segment]; + } + + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(newToOld) + RamUsageEstimator.sizeOf(oldToNew); + } + + } + + /** + * Create an ordinal map that uses the number of unique values of each + * {@link SortedDocValues} instance as a weight. + * @see #build(Object, TermsEnum[], long[], float) + */ + public static XOrdinalMap build(Object owner, SortedDocValues[] values, float acceptableOverheadRatio) throws IOException { + final TermsEnum[] subs = new TermsEnum[values.length]; + final long[] weights = new long[values.length]; + for (int i = 0; i < values.length; ++i) { + subs[i] = values[i].termsEnum(); + weights[i] = values[i].getValueCount(); + } + return build(owner, subs, weights, acceptableOverheadRatio); + } + + /** + * Create an ordinal map that uses the number of unique values of each + * {@link SortedSetDocValues} instance as a weight. + * @see #build(Object, TermsEnum[], long[], float) + */ + public static XOrdinalMap build(Object owner, SortedSetDocValues[] values, float acceptableOverheadRatio) throws IOException { + final TermsEnum[] subs = new TermsEnum[values.length]; + final long[] weights = new long[values.length]; + for (int i = 0; i < values.length; ++i) { + subs[i] = values[i].termsEnum(); + weights[i] = values[i].getValueCount(); + } + return build(owner, subs, weights, acceptableOverheadRatio); + } + + /** + * Creates an ordinal map that allows mapping ords to/from a merged + * space from subs. + * @param owner a cache key + * @param subs TermsEnums that support {@link TermsEnum#ord()}. They need + * not be dense (e.g. can be FilteredTermsEnums}. + * @param weights a weight for each sub. This is ideally correlated with + * the number of unique terms that each sub introduces compared + * to the other subs + * @throws IOException if an I/O error occurred. 
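+ * 
+ * A rough usage sketch, not part of the patched sources (cacheKey, perSegmentEnums and
+ * perSegmentWeights are hypothetical placeholders): build the map once per top-level reader,
+ * then translate a per-segment ordinal into the global ordinal space:
+ * 
+ *   XOrdinalMap map = XOrdinalMap.build(cacheKey, perSegmentEnums, perSegmentWeights, PackedInts.DEFAULT);
+ *   LongValues toGlobal = map.getGlobalOrds(segmentIndex); // segmentOrd -> globalOrd
+ *   long globalOrd = toGlobal.get(segmentOrd);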
+ */ + public static XOrdinalMap build(Object owner, TermsEnum subs[], long[] weights, float acceptableOverheadRatio) throws IOException { + if (subs.length != weights.length) { + throw new IllegalArgumentException("subs and weights must have the same length"); + } + + // enums are not sorted, so let's sort to save memory + final SegmentMap segmentMap = new SegmentMap(weights); + return new XOrdinalMap(owner, subs, segmentMap, acceptableOverheadRatio); + } + + private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(XOrdinalMap.class); + + // cache key of whoever asked for this awful thing + final Object owner; + // globalOrd -> (globalOrd - segmentOrd) where segmentOrd is the the ordinal in the first segment that contains this term + final MonotonicAppendingLongBuffer globalOrdDeltas; + // globalOrd -> first segment container + final AppendingPackedLongBuffer firstSegments; + // for every segment, segmentOrd -> globalOrd + final LongValues segmentToGlobalOrds[]; + // the map from/to segment ids + final SegmentMap segmentMap; + // ram usage + final long ramBytesUsed; + + XOrdinalMap(Object owner, TermsEnum subs[], SegmentMap segmentMap, float acceptableOverheadRatio) throws IOException { + // create the ordinal mappings by pulling a termsenum over each sub's + // unique terms, and walking a multitermsenum over those + this.owner = owner; + this.segmentMap = segmentMap; + // even though we accept an overhead ratio, we keep these ones with COMPACT + // since they are only used to resolve values given a global ord, which is + // slow anyway + globalOrdDeltas = new MonotonicAppendingLongBuffer(PackedInts.COMPACT); + firstSegments = new AppendingPackedLongBuffer(PackedInts.COMPACT); + final MonotonicAppendingLongBuffer[] ordDeltas = new MonotonicAppendingLongBuffer[subs.length]; + for (int i = 0; i < ordDeltas.length; i++) { + ordDeltas[i] = new MonotonicAppendingLongBuffer(acceptableOverheadRatio); + } + long[] ordDeltaBits = new long[subs.length]; + long segmentOrds[] = new long[subs.length]; + ReaderSlice slices[] = new ReaderSlice[subs.length]; + TermsEnumIndex indexes[] = new TermsEnumIndex[slices.length]; + for (int i = 0; i < slices.length; i++) { + slices[i] = new ReaderSlice(0, 0, i); + indexes[i] = new TermsEnumIndex(subs[segmentMap.newToOld(i)], i); + } + MultiTermsEnum mte = new MultiTermsEnum(slices); + mte.reset(indexes); + long globalOrd = 0; + while (mte.next() != null) { + TermsEnumWithSlice matches[] = mte.getMatchArray(); + int firstSegmentIndex = Integer.MAX_VALUE; + long globalOrdDelta = Long.MAX_VALUE; + for (int i = 0; i < mte.getMatchCount(); i++) { + int segmentIndex = matches[i].index; + long segmentOrd = matches[i].terms.ord(); + long delta = globalOrd - segmentOrd; + // We compute the least segment where the term occurs. In case the + // first segment contains most (or better all) values, this will + // help save significant memory + if (segmentIndex < firstSegmentIndex) { + firstSegmentIndex = segmentIndex; + globalOrdDelta = delta; + } + // for each per-segment ord, map it back to the global term. 
+ while (segmentOrds[segmentIndex] <= segmentOrd) { + ordDeltaBits[segmentIndex] |= delta; + ordDeltas[segmentIndex].add(delta); + segmentOrds[segmentIndex]++; + } + } + // for each unique term, just mark the first segment index/delta where it occurs + assert firstSegmentIndex < segmentOrds.length; + firstSegments.add(firstSegmentIndex); + globalOrdDeltas.add(globalOrdDelta); + globalOrd++; + } + firstSegments.freeze(); + globalOrdDeltas.freeze(); + for (int i = 0; i < ordDeltas.length; ++i) { + ordDeltas[i].freeze(); + } + // ordDeltas is typically the bottleneck, so let's see what we can do to make it faster + segmentToGlobalOrds = new LongValues[subs.length]; + long ramBytesUsed = BASE_RAM_BYTES_USED + globalOrdDeltas.ramBytesUsed() + + firstSegments.ramBytesUsed() + RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds) + + segmentMap.ramBytesUsed(); + for (int i = 0; i < ordDeltas.length; ++i) { + final MonotonicAppendingLongBuffer deltas = ordDeltas[i]; + if (ordDeltaBits[i] == 0L) { + // segment ords perfectly match global ordinals + // likely in case of low cardinalities and large segments + segmentToGlobalOrds[i] = LongValues.IDENTITY; + } else { + final int bitsRequired = ordDeltaBits[i] < 0 ? 64 : PackedInts.bitsRequired(ordDeltaBits[i]); + final long monotonicBits = deltas.ramBytesUsed() * 8; + final long packedBits = bitsRequired * deltas.size(); + if (deltas.size() <= Integer.MAX_VALUE + && packedBits <= monotonicBits * (1 + acceptableOverheadRatio)) { + // monotonic compression mostly adds overhead, let's keep the mapping in plain packed ints + final int size = (int) deltas.size(); + final PackedInts.Mutable newDeltas = PackedInts.getMutable(size, bitsRequired, acceptableOverheadRatio); + final MonotonicAppendingLongBuffer.Iterator it = deltas.iterator(); + for (int ord = 0; ord < size; ++ord) { + newDeltas.set(ord, it.next()); + } + assert !it.hasNext(); + segmentToGlobalOrds[i] = new LongValues() { + @Override + public long get(long ord) { + return ord + newDeltas.get((int) ord); + } + }; + ramBytesUsed += newDeltas.ramBytesUsed(); + } else { + segmentToGlobalOrds[i] = new LongValues() { + @Override + public long get(long ord) { + return ord + deltas.get(ord); + } + }; + ramBytesUsed += deltas.ramBytesUsed(); + } + ramBytesUsed += RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds[i]); + } + } + this.ramBytesUsed = ramBytesUsed; + } + + /** + * Given a segment number, return a {@link LongValues} instance that maps + * segment ordinals to global ordinals. + */ + public LongValues getGlobalOrds(int segmentIndex) { + return segmentToGlobalOrds[segmentMap.oldToNew(segmentIndex)]; + } + + /** + * Given global ordinal, returns the ordinal of the first segment which contains + * this ordinal (the corresponding to the segment return {@link #getFirstSegmentNumber}). + */ + public long getFirstSegmentOrd(long globalOrd) { + return globalOrd - globalOrdDeltas.get(globalOrd); + } + + /** + * Given a global ordinal, returns the index of the first + * segment that contains this term. + */ + public int getFirstSegmentNumber(long globalOrd) { + return segmentMap.newToOld((int) firstSegments.get(globalOrd)); + } + + /** + * Returns the total number of unique terms in global ord space. 
+ */ + public long getValueCount() { + return globalOrdDeltas.size(); + } + + @Override + public long ramBytesUsed() { + return ramBytesUsed; + } +} diff --git a/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java b/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java index 2d9e2985a437b..85268cc73acc9 100644 --- a/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java +++ b/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java @@ -250,8 +250,8 @@ public XAnalyzingSuggester(Analyzer indexAnalyzer, Automaton queryPrefix, Analyz } /** Returns byte size of the underlying FST. */ - public long sizeInBytes() { - return fst == null ? 0 : fst.sizeInBytes(); + public long ramBytesUsed() { + return fst == null ? 0 : fst.ramBytesUsed(); } private static void copyDestTransitions(State from, State to, List transitions) { @@ -910,7 +910,7 @@ public final Set toFiniteStrings(final TokenStreamToAutomaton ts2a, Tok // TODO: we could walk & add simultaneously, so we // don't have to alloc [possibly biggish] // intermediate HashSet in RAM: - return XSpecialOperations.getFiniteStrings(automaton, maxGraphExpansions); + return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions); } final Automaton toLookupAutomaton(final CharSequence key) throws IOException { diff --git a/src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java b/src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java index 498da5a6d83e2..c030cd913a57a 100644 --- a/src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java +++ b/src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java @@ -219,7 +219,7 @@ public TokenStreamToAutomaton getTokenStreamToAutomaton() { } Automaton toLevenshteinAutomata(Automaton automaton) { - final Set ref = XSpecialOperations.getFiniteStrings(automaton, -1); + final Set ref = SpecialOperations.getFiniteStrings(automaton, -1); Automaton subs[] = new Automaton[ref.size()]; int upto = 0; for (IntsRef path : ref) { diff --git a/src/main/java/org/apache/lucene/search/suggest/analyzing/XSpecialOperations.java b/src/main/java/org/apache/lucene/search/suggest/analyzing/XSpecialOperations.java deleted file mode 100644 index 6f52b85c39652..0000000000000 --- a/src/main/java/org/apache/lucene/search/suggest/analyzing/XSpecialOperations.java +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.lucene.search.suggest.analyzing; - -import java.util.Collections; -import java.util.HashSet; -import java.util.IdentityHashMap; -import java.util.Set; - -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.State; -import org.apache.lucene.util.automaton.Transition; -import org.apache.lucene.util.fst.Util; -import org.elasticsearch.Version; - -class XSpecialOperations { - - // TODO Lucene 4.9: remove this once we upgrade; see - // LUCENE-5628 - - static { - assert Version.CURRENT.luceneVersion == org.apache.lucene.util.Version.LUCENE_48: "Remove this code once we upgrade to Lucene 4.9 where LUCENE-5628 is fixed"; - } - - private static class PathNode { - - /** Which state the path node ends on, whose - * transitions we are enumerating. */ - public State state; - - /** Which state the current transition leads to. */ - public State to; - - /** Which transition we are on. */ - public int transition; - - /** Which label we are on, in the min-max range of the - * current Transition */ - public int label; - - public void resetState(State state) { - assert state.numTransitions() != 0; - this.state = state; - transition = 0; - Transition t = state.transitionsArray[transition]; - label = t.getMin(); - to = t.getDest(); - } - - /** Returns next label of current transition, or - * advances to next transition and returns its first - * label, if current one is exhausted. If there are - * no more transitions, returns -1. */ - public int nextLabel() { - if (label > state.transitionsArray[transition].getMax()) { - // We've exhaused the current transition's labels; - // move to next transitions: - transition++; - if (transition >= state.numTransitions()) { - // We're done iterating transitions leaving this state - return -1; - } - Transition t = state.transitionsArray[transition]; - label = t.getMin(); - to = t.getDest(); - } - return label++; - } - } - - private static PathNode getNode(PathNode[] nodes, int index) { - assert index < nodes.length; - if (nodes[index] == null) { - nodes[index] = new PathNode(); - } - return nodes[index]; - } - - // TODO: this is a dangerous method ... Automaton could be - // huge ... and it's better in general for caller to - // enumerate & process in a single walk: - - /** Returns the set of accepted strings, up to at most - * limit strings. If more than limit - * strings are accepted, the first limit strings found are returned. If limit == -1, then - * the limit is infinite. If the {@link Automaton} has - * cycles then this method might throw {@code - * IllegalArgumentException} but that is not guaranteed - * when the limit is set. 
*/ - public static Set getFiniteStrings(Automaton a, int limit) { - Set results = new HashSet<>(); - - if (limit == -1 || limit > 0) { - // OK - } else { - throw new IllegalArgumentException("limit must be -1 (which means no limit), or > 0; got: " + limit); - } - - if (a.getSingleton() != null) { - // Easy case: automaton accepts only 1 string - results.add(Util.toUTF32(a.getSingleton(), new IntsRef())); - } else { - - if (a.getInitialState().isAccept()) { - // Special case the empty string, as usual: - results.add(new IntsRef()); - } - - if (a.getInitialState().numTransitions() > 0 && (limit == -1 || results.size() < limit)) { - - // TODO: we could use state numbers here and just - // alloc array, but asking for states array can be - // costly (it's lazily computed): - - // Tracks which states are in the current path, for - // cycle detection: - Set pathStates = Collections.newSetFromMap(new IdentityHashMap()); - - // Stack to hold our current state in the - // recursion/iteration: - PathNode[] nodes = new PathNode[4]; - - pathStates.add(a.getInitialState()); - PathNode root = getNode(nodes, 0); - root.resetState(a.getInitialState()); - - IntsRef string = new IntsRef(1); - string.length = 1; - - while (string.length > 0) { - - PathNode node = nodes[string.length-1]; - - // Get next label leaving the current node: - int label = node.nextLabel(); - - if (label != -1) { - string.ints[string.length-1] = label; - - if (node.to.isAccept()) { - // This transition leads to an accept state, - // so we save the current string: - results.add(IntsRef.deepCopyOf(string)); - if (results.size() == limit) { - break; - } - } - - if (node.to.numTransitions() != 0) { - // Now recurse: the destination of this transition has - // outgoing transitions: - if (pathStates.contains(node.to)) { - throw new IllegalArgumentException("automaton has cycles"); - } - pathStates.add(node.to); - - // Push node onto stack: - if (nodes.length == string.length) { - PathNode[] newNodes = new PathNode[ArrayUtil.oversize(nodes.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - System.arraycopy(nodes, 0, newNodes, 0, nodes.length); - nodes = newNodes; - } - getNode(nodes, string.length).resetState(node.to); - string.length++; - string.grow(string.length); - } - } else { - // No more transitions leaving this state, - // pop/return back to previous state: - assert pathStates.contains(node.state); - pathStates.remove(node.state); - string.length--; - } - } - } - } - - return results; - } -} diff --git a/src/main/java/org/apache/lucene/store/BufferedChecksumIndexOutput.java b/src/main/java/org/apache/lucene/store/BufferedChecksumIndexOutput.java index 0c64a93bd887e..c3dc07f2f1763 100644 --- a/src/main/java/org/apache/lucene/store/BufferedChecksumIndexOutput.java +++ b/src/main/java/org/apache/lucene/store/BufferedChecksumIndexOutput.java @@ -89,25 +89,11 @@ public void flush() throws IOException { } } - @Override - public void seek(long pos) throws IOException { - // seek might be called on files, which means that the checksum is not file checksum - // but a checksum of the bytes written to this stream, which is the same for each - // type of file in lucene - super.seek(pos); - delegate.seek(pos); - } - @Override public long length() throws IOException { return delegate.length(); } - @Override - public void setLength(long length) throws IOException { - delegate.setLength(length); - } - @Override public String toString() { return delegate.toString(); diff --git a/src/main/java/org/apache/lucene/store/BufferedIndexOutput.java 
b/src/main/java/org/apache/lucene/store/BufferedIndexOutput.java new file mode 100644 index 0000000000000..f2788e643b76e --- /dev/null +++ b/src/main/java/org/apache/lucene/store/BufferedIndexOutput.java @@ -0,0 +1,155 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.store; + +import java.io.IOException; +import java.util.zip.CRC32; + +/** Base implementation class for buffered {@link IndexOutput}. */ +public abstract class BufferedIndexOutput extends IndexOutput { + /** The default buffer size in bytes ({@value #DEFAULT_BUFFER_SIZE}). */ + public static final int DEFAULT_BUFFER_SIZE = 16384; + + private final int bufferSize; + private final byte[] buffer; + private long bufferStart = 0; // position in file of buffer + private int bufferPosition = 0; // position in buffer + private final CRC32 crc = new CRC32(); + + /** + * Creates a new {@link BufferedIndexOutput} with the default buffer size + * ({@value #DEFAULT_BUFFER_SIZE} bytes see {@link #DEFAULT_BUFFER_SIZE}) + */ + public BufferedIndexOutput() { + this(DEFAULT_BUFFER_SIZE); + } + + /** + * Creates a new {@link BufferedIndexOutput} with the given buffer size. + * @param bufferSize the buffer size in bytes used to buffer writes internally. + * @throws IllegalArgumentException if the given buffer size is less or equal to 0 + */ + public BufferedIndexOutput(int bufferSize) { + if (bufferSize <= 0) { + throw new IllegalArgumentException("bufferSize must be greater than 0 (got " + bufferSize + ")"); + } + this.bufferSize = bufferSize; + buffer = new byte[bufferSize]; + } + + @Override + public void writeByte(byte b) throws IOException { + if (bufferPosition >= bufferSize) + flush(); + buffer[bufferPosition++] = b; + } + + @Override + public void writeBytes(byte[] b, int offset, int length) throws IOException { + int bytesLeft = bufferSize - bufferPosition; + // is there enough space in the buffer? + if (bytesLeft >= length) { + // we add the data to the end of the buffer + System.arraycopy(b, offset, buffer, bufferPosition, length); + bufferPosition += length; + // if the buffer is full, flush it + if (bufferSize - bufferPosition == 0) + flush(); + } else { + // is data larger then buffer? + if (length > bufferSize) { + // we flush the buffer + if (bufferPosition > 0) + flush(); + // and write data at once + crc.update(b, offset, length); + flushBuffer(b, offset, length); + bufferStart += length; + } else { + // we fill/flush the buffer (until the input is written) + int pos = 0; // position in the input data + int pieceLength; + while (pos < length) { + pieceLength = (length - pos < bytesLeft) ? 
length - pos : bytesLeft; + System.arraycopy(b, pos + offset, buffer, bufferPosition, pieceLength); + pos += pieceLength; + bufferPosition += pieceLength; + // if the buffer is full, flush it + bytesLeft = bufferSize - bufferPosition; + if (bytesLeft == 0) { + flush(); + bytesLeft = bufferSize; + } + } + } + } + } + + @Override + public void flush() throws IOException { + crc.update(buffer, 0, bufferPosition); + flushBuffer(buffer, bufferPosition); + bufferStart += bufferPosition; + bufferPosition = 0; + } + + /** Expert: implements buffer write. Writes bytes at the current position in + * the output. + * @param b the bytes to write + * @param len the number of bytes to write + */ + private void flushBuffer(byte[] b, int len) throws IOException { + flushBuffer(b, 0, len); + } + + /** Expert: implements buffer write. Writes bytes at the current position in + * the output. + * @param b the bytes to write + * @param offset the offset in the byte array + * @param len the number of bytes to write + */ + protected abstract void flushBuffer(byte[] b, int offset, int len) throws IOException; + + @Override + public void close() throws IOException { + flush(); + } + + @Override + public long getFilePointer() { + return bufferStart + bufferPosition; + } + + @Override + public abstract long length() throws IOException; + + /** + * Returns size of the used output buffer in bytes. + * */ + public final int getBufferSize() { + return bufferSize; + } + + @Override + public long getChecksum() throws IOException { + flush(); + return crc.getValue(); + } +} diff --git a/src/main/java/org/apache/lucene/store/RateLimitedFSDirectory.java b/src/main/java/org/apache/lucene/store/RateLimitedFSDirectory.java index 2c4bb68cd7f65..30dafb48df578 100644 --- a/src/main/java/org/apache/lucene/store/RateLimitedFSDirectory.java +++ b/src/main/java/org/apache/lucene/store/RateLimitedFSDirectory.java @@ -117,12 +117,6 @@ public long length() throws IOException { return delegate.length(); } - @Override - public void seek(long pos) throws IOException { - flush(); - delegate.seek(pos); - } - @Override public void flush() throws IOException { try { @@ -132,11 +126,6 @@ public void flush() throws IOException { } } - @Override - public void setLength(long length) throws IOException { - delegate.setLength(length); - } - @Override public void close() throws IOException { try { diff --git a/src/main/java/org/apache/lucene/store/XNativeFSLockFactory.java b/src/main/java/org/apache/lucene/store/XNativeFSLockFactory.java deleted file mode 100644 index 30cd1adf6a3de..0000000000000 --- a/src/main/java/org/apache/lucene/store/XNativeFSLockFactory.java +++ /dev/null @@ -1,246 +0,0 @@ -package org.apache.lucene.store; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.nio.channels.FileChannel; -import java.nio.channels.FileLock; -import java.nio.channels.OverlappingFileLockException; -import java.nio.file.StandardOpenOption; -import java.io.File; -import java.io.IOException; -import java.util.Collections; -import java.util.HashSet; -import java.util.Set; - -import org.apache.lucene.util.IOUtils; -import org.elasticsearch.Version; - -/** - *
<p>Implements {@link LockFactory} using native OS file - * locks. Note that because this LockFactory relies on - * java.nio.* APIs for locking, any problems with those APIs - * will cause locking to fail. Specifically, on certain NFS - * environments the java.nio.* locks will fail (the lock can - * incorrectly be double acquired) whereas {@link - * SimpleFSLockFactory} worked perfectly in those same - * environments. For NFS based access to an index, it's - * recommended that you try {@link SimpleFSLockFactory} - * first and work around the one limitation that a lock file - * could be left when the JVM exits abnormally.</p>
- * - * <p>The primary benefit of {@link XNativeFSLockFactory} is - * that locks (not the lock file itsself) will be properly - * removed (by the OS) if the JVM has an abnormal exit.</p>
- * - * <p>Note that, unlike {@link SimpleFSLockFactory}, the existence of - * leftover lock files in the filesystem is fine because the OS - * will free the locks held against these files even though the - * files still remain. Lucene will never actively remove the lock - * files, so although you see them, the index may not be locked.</p>
- * - * <p>Special care needs to be taken if you change the locking - * implementation: First be certain that no writer is in fact - * writing to the index otherwise you can easily corrupt - * your index. Be sure to do the LockFactory change on all Lucene - * instances and clean up all leftover lock files before starting - * the new configuration for the first time. Different implementations - * can not work together!</p>
- * - * <p>If you suspect that this or any other LockFactory is - * not working properly in your environment, you can easily - * test it by using {@link VerifyingLockFactory}, {@link - * LockVerifyServer} and {@link LockStressTest}.</p>
- * - * @see LockFactory - */ - -public class XNativeFSLockFactory extends FSLockFactory { - - static { - assert Version.CURRENT.luceneVersion == org.apache.lucene.util.Version.LUCENE_48 : "Remove this class in Lucene 4.9"; - } - - /** - * Create a XNativeFSLockFactory instance, with null (unset) - * lock directory. When you pass this factory to a {@link FSDirectory} - * subclass, the lock directory is automatically set to the - * directory itself. Be sure to create one instance for each directory - * your create! - */ - public XNativeFSLockFactory() { - this((File) null); - } - - /** - * Create a XNativeFSLockFactory instance, storing lock - * files into the specified lockDirName: - * - * @param lockDirName where lock files are created. - */ - public XNativeFSLockFactory(String lockDirName) { - this(new File(lockDirName)); - } - - /** - * Create a XNativeFSLockFactory instance, storing lock - * files into the specified lockDir: - * - * @param lockDir where lock files are created. - */ - public XNativeFSLockFactory(File lockDir) { - setLockDir(lockDir); - } - - @Override - public synchronized Lock makeLock(String lockName) { - if (lockPrefix != null) - lockName = lockPrefix + "-" + lockName; - return new NativeFSLock(lockDir, lockName); - } - - @Override - public void clearLock(String lockName) throws IOException { - makeLock(lockName).close(); - } -} - -class NativeFSLock extends Lock { - - private FileChannel channel; - private FileLock lock; - private File path; - private File lockDir; - private static final Set LOCK_HELD = Collections.synchronizedSet(new HashSet()); - - - public NativeFSLock(File lockDir, String lockFileName) { - this.lockDir = lockDir; - path = new File(lockDir, lockFileName); - } - - - @Override - public synchronized boolean obtain() throws IOException { - - if (lock != null) { - // Our instance is already locked: - return false; - } - - // Ensure that lockDir exists and is a directory. - if (!lockDir.exists()) { - if (!lockDir.mkdirs()) - throw new IOException("Cannot create directory: " + - lockDir.getAbsolutePath()); - } else if (!lockDir.isDirectory()) { - // TODO: NoSuchDirectoryException instead? - throw new IOException("Found regular file where directory expected: " + - lockDir.getAbsolutePath()); - } - final String canonicalPath = path.getCanonicalPath(); - // Make sure nobody else in-process has this lock held - // already, and, mark it held if not: - // This is a pretty crazy workaround for some documented - // but yet awkward JVM behavior: - // - // On some systems, closing a channel releases all locks held by the Java virtual machine on the underlying file - // regardless of whether the locks were acquired via that channel or via another channel open on the same file. - // It is strongly recommended that, within a program, a unique channel be used to acquire all locks on any given - // file. - // - // This essentially means if we close "A" channel for a given file all locks might be released... the odd part - // is that we can't re-obtain the lock in the same JVM but from a different process if that happens. Nevertheless - // this is super trappy. 
See LUCENE-5738 - boolean obtained = false; - if (LOCK_HELD.add(canonicalPath)) { - try { - channel = FileChannel.open(path.toPath(), StandardOpenOption.CREATE, StandardOpenOption.WRITE); - try { - lock = channel.tryLock(); - obtained = lock != null; - } catch (IOException | OverlappingFileLockException e) { - // At least on OS X, we will sometimes get an - // intermittent "Permission Denied" IOException, - // which seems to simply mean "you failed to get - // the lock". But other IOExceptions could be - // "permanent" (eg, locking is not supported via - // the filesystem). So, we record the failure - // reason here; the timeout obtain (usually the - // one calling us) will use this as "root cause" - // if it fails to get the lock. - failureReason = e; - } - } finally { - if (obtained == false) { // not successful - clear up and move out - clearLockHeld(path); - final FileChannel toClose = channel; - channel = null; - IOUtils.closeWhileHandlingException(toClose); - } - } - } - return obtained; - } - - @Override - public synchronized void close() throws IOException { - try { - if (lock != null) { - try { - lock.release(); - lock = null; - } finally { - clearLockHeld(path); - } - } - } finally { - IOUtils.close(channel); - channel = null; - } - } - - private static final void clearLockHeld(File path) throws IOException { - boolean remove = LOCK_HELD.remove(path.getCanonicalPath()); - assert remove : "Lock was cleared but never marked as held"; - } - - @Override - public synchronized boolean isLocked() { - // The test for is isLocked is not directly possible with native file locks: - - // First a shortcut, if a lock reference in this instance is available - if (lock != null) return true; - - // Look if lock file is present; if not, there can definitely be no lock! 
- if (!path.exists()) return false; - - // Try to obtain and release (if was locked) the lock - try { - boolean obtained = obtain(); - if (obtained) close(); - return !obtained; - } catch (IOException ioe) { - return false; - } - } - - @Override - public String toString() { - return "NativeFSLock@" + path; - } -} diff --git a/src/main/java/org/elasticsearch/Version.java b/src/main/java/org/elasticsearch/Version.java index e70797fa53593..c97a6551eb401 100644 --- a/src/main/java/org/elasticsearch/Version.java +++ b/src/main/java/org/elasticsearch/Version.java @@ -42,152 +42,152 @@ public class Version implements Serializable { // the (internal) format of the id is there so we can easily do after/before checks on the id public static final int V_0_18_0_ID = /*00*/180099; - public static final Version V_0_18_0 = new Version(V_0_18_0_ID, false, org.apache.lucene.util.Version.LUCENE_36); + public static final Version V_0_18_0 = new Version(V_0_18_0_ID, false, org.apache.lucene.util.Version.LUCENE_3_6); public static final int V_0_18_1_ID = /*00*/180199; - public static final Version V_0_18_1 = new Version(V_0_18_1_ID, false, org.apache.lucene.util.Version.LUCENE_36); + public static final Version V_0_18_1 = new Version(V_0_18_1_ID, false, org.apache.lucene.util.Version.LUCENE_3_6); public static final int V_0_18_2_ID = /*00*/180299; - public static final Version V_0_18_2 = new Version(V_0_18_2_ID, false, org.apache.lucene.util.Version.LUCENE_36); + public static final Version V_0_18_2 = new Version(V_0_18_2_ID, false, org.apache.lucene.util.Version.LUCENE_3_6); public static final int V_0_18_3_ID = /*00*/180399; - public static final Version V_0_18_3 = new Version(V_0_18_3_ID, false, org.apache.lucene.util.Version.LUCENE_36); + public static final Version V_0_18_3 = new Version(V_0_18_3_ID, false, org.apache.lucene.util.Version.LUCENE_3_6); public static final int V_0_18_4_ID = /*00*/180499; - public static final Version V_0_18_4 = new Version(V_0_18_4_ID, false, org.apache.lucene.util.Version.LUCENE_36); + public static final Version V_0_18_4 = new Version(V_0_18_4_ID, false, org.apache.lucene.util.Version.LUCENE_3_6); public static final int V_0_18_5_ID = /*00*/180599; - public static final Version V_0_18_5 = new Version(V_0_18_5_ID, false, org.apache.lucene.util.Version.LUCENE_36); + public static final Version V_0_18_5 = new Version(V_0_18_5_ID, false, org.apache.lucene.util.Version.LUCENE_3_6); public static final int V_0_18_6_ID = /*00*/180699; - public static final Version V_0_18_6 = new Version(V_0_18_6_ID, false, org.apache.lucene.util.Version.LUCENE_36); + public static final Version V_0_18_6 = new Version(V_0_18_6_ID, false, org.apache.lucene.util.Version.LUCENE_3_6); public static final int V_0_18_7_ID = /*00*/180799; - public static final Version V_0_18_7 = new Version(V_0_18_7_ID, false, org.apache.lucene.util.Version.LUCENE_36); + public static final Version V_0_18_7 = new Version(V_0_18_7_ID, false, org.apache.lucene.util.Version.LUCENE_3_6); public static final int V_0_18_8_ID = /*00*/180899; - public static final Version V_0_18_8 = new Version(V_0_18_8_ID, false, org.apache.lucene.util.Version.LUCENE_36); + public static final Version V_0_18_8 = new Version(V_0_18_8_ID, false, org.apache.lucene.util.Version.LUCENE_3_6); public static final int V_0_19_0_RC1_ID = /*00*/190051; - public static final Version V_0_19_0_RC1 = new Version(V_0_19_0_RC1_ID, false, org.apache.lucene.util.Version.LUCENE_36); + public static final Version V_0_19_0_RC1 = new Version(V_0_19_0_RC1_ID, false, 
org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_0_RC2_ID = /*00*/190052;
-    public static final Version V_0_19_0_RC2 = new Version(V_0_19_0_RC2_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_0_RC2 = new Version(V_0_19_0_RC2_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_0_RC3_ID = /*00*/190053;
-    public static final Version V_0_19_0_RC3 = new Version(V_0_19_0_RC3_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_0_RC3 = new Version(V_0_19_0_RC3_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_0_ID = /*00*/190099;
-    public static final Version V_0_19_0 = new Version(V_0_19_0_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_0 = new Version(V_0_19_0_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_1_ID = /*00*/190199;
-    public static final Version V_0_19_1 = new Version(V_0_19_1_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_1 = new Version(V_0_19_1_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_2_ID = /*00*/190299;
-    public static final Version V_0_19_2 = new Version(V_0_19_2_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_2 = new Version(V_0_19_2_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_3_ID = /*00*/190399;
-    public static final Version V_0_19_3 = new Version(V_0_19_3_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_3 = new Version(V_0_19_3_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_4_ID = /*00*/190499;
-    public static final Version V_0_19_4 = new Version(V_0_19_4_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_4 = new Version(V_0_19_4_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_5_ID = /*00*/190599;
-    public static final Version V_0_19_5 = new Version(V_0_19_5_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_5 = new Version(V_0_19_5_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_6_ID = /*00*/190699;
-    public static final Version V_0_19_6 = new Version(V_0_19_6_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_6 = new Version(V_0_19_6_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_7_ID = /*00*/190799;
-    public static final Version V_0_19_7 = new Version(V_0_19_7_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_7 = new Version(V_0_19_7_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_8_ID = /*00*/190899;
-    public static final Version V_0_19_8 = new Version(V_0_19_8_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_8 = new Version(V_0_19_8_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_9_ID = /*00*/190999;
-    public static final Version V_0_19_9 = new Version(V_0_19_9_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_9 = new Version(V_0_19_9_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_10_ID = /*00*/191099;
-    public static final Version V_0_19_10 = new Version(V_0_19_10_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_10 = new Version(V_0_19_10_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_11_ID = /*00*/191199;
-    public static final Version V_0_19_11 = new Version(V_0_19_11_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_11 = new Version(V_0_19_11_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_12_ID = /*00*/191299;
-    public static final Version V_0_19_12 = new Version(V_0_19_12_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_12 = new Version(V_0_19_12_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_19_13_ID = /*00*/191399;
-    public static final Version V_0_19_13 = new Version(V_0_19_13_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_19_13 = new Version(V_0_19_13_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_20_0_RC1_ID = /*00*/200051;
-    public static final Version V_0_20_0_RC1 = new Version(V_0_20_0_RC1_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_20_0_RC1 = new Version(V_0_20_0_RC1_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_20_0_ID = /*00*/200099;
-    public static final Version V_0_20_0 = new Version(V_0_20_0_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_20_0 = new Version(V_0_20_0_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_20_1_ID = /*00*/200199;
-    public static final Version V_0_20_1 = new Version(V_0_20_1_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_20_1 = new Version(V_0_20_1_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_20_2_ID = /*00*/200299;
-    public static final Version V_0_20_2 = new Version(V_0_20_2_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_20_2 = new Version(V_0_20_2_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_20_3_ID = /*00*/200399;
-    public static final Version V_0_20_3 = new Version(V_0_20_3_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_20_3 = new Version(V_0_20_3_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_20_4_ID = /*00*/200499;
-    public static final Version V_0_20_4 = new Version(V_0_20_4_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_20_4 = new Version(V_0_20_4_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_20_5_ID = /*00*/200599;
-    public static final Version V_0_20_5 = new Version(V_0_20_5_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_20_5 = new Version(V_0_20_5_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_20_6_ID = /*00*/200699;
-    public static final Version V_0_20_6 = new Version(V_0_20_6_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_20_6 = new Version(V_0_20_6_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_20_7_ID = /*00*/200799;
-    public static final Version V_0_20_7 = new Version(V_0_20_7_ID, false, org.apache.lucene.util.Version.LUCENE_36);
+    public static final Version V_0_20_7 = new Version(V_0_20_7_ID, false, org.apache.lucene.util.Version.LUCENE_3_6);
     public static final int V_0_90_0_Beta1_ID = /*00*/900001;
-    public static final Version V_0_90_0_Beta1 = new Version(V_0_90_0_Beta1_ID, false, org.apache.lucene.util.Version.LUCENE_41);
+    public static final Version V_0_90_0_Beta1 = new Version(V_0_90_0_Beta1_ID, false, org.apache.lucene.util.Version.LUCENE_4_1);
     public static final int V_0_90_0_RC1_ID = /*00*/900051;
-    public static final Version V_0_90_0_RC1 = new Version(V_0_90_0_RC1_ID, false, org.apache.lucene.util.Version.LUCENE_41);
+    public static final Version V_0_90_0_RC1 = new Version(V_0_90_0_RC1_ID, false, org.apache.lucene.util.Version.LUCENE_4_1);
     public static final int V_0_90_0_RC2_ID = /*00*/900052;
-    public static final Version V_0_90_0_RC2 = new Version(V_0_90_0_RC2_ID, false, org.apache.lucene.util.Version.LUCENE_42);
+    public static final Version V_0_90_0_RC2 = new Version(V_0_90_0_RC2_ID, false, org.apache.lucene.util.Version.LUCENE_4_2);
     public static final int V_0_90_0_ID = /*00*/900099;
-    public static final Version V_0_90_0 = new Version(V_0_90_0_ID, false, org.apache.lucene.util.Version.LUCENE_42);
+    public static final Version V_0_90_0 = new Version(V_0_90_0_ID, false, org.apache.lucene.util.Version.LUCENE_4_2);
     public static final int V_0_90_1_ID = /*00*/900199;
-    public static final Version V_0_90_1 = new Version(V_0_90_1_ID, false, org.apache.lucene.util.Version.LUCENE_43);
+    public static final Version V_0_90_1 = new Version(V_0_90_1_ID, false, org.apache.lucene.util.Version.LUCENE_4_3);
     public static final int V_0_90_2_ID = /*00*/900299;
-    public static final Version V_0_90_2 = new Version(V_0_90_2_ID, false, org.apache.lucene.util.Version.LUCENE_43);
+    public static final Version V_0_90_2 = new Version(V_0_90_2_ID, false, org.apache.lucene.util.Version.LUCENE_4_3);
     public static final int V_0_90_3_ID = /*00*/900399;
-    public static final Version V_0_90_3 = new Version(V_0_90_3_ID, false, org.apache.lucene.util.Version.LUCENE_44);
+    public static final Version V_0_90_3 = new Version(V_0_90_3_ID, false, org.apache.lucene.util.Version.LUCENE_4_4);
     public static final int V_0_90_4_ID = /*00*/900499;
-    public static final Version V_0_90_4 = new Version(V_0_90_4_ID, false, org.apache.lucene.util.Version.LUCENE_44);
+    public static final Version V_0_90_4 = new Version(V_0_90_4_ID, false, org.apache.lucene.util.Version.LUCENE_4_4);
     public static final int V_0_90_5_ID = /*00*/900599;
-    public static final Version V_0_90_5 = new Version(V_0_90_5_ID, false, org.apache.lucene.util.Version.LUCENE_44);
+    public static final Version V_0_90_5 = new Version(V_0_90_5_ID, false, org.apache.lucene.util.Version.LUCENE_4_4);
     public static final int V_0_90_6_ID = /*00*/900699;
-    public static final Version V_0_90_6 = new Version(V_0_90_6_ID, false, org.apache.lucene.util.Version.LUCENE_45);
+    public static final Version V_0_90_6 = new Version(V_0_90_6_ID, false, org.apache.lucene.util.Version.LUCENE_4_5);
     public static final int V_0_90_7_ID = /*00*/900799;
-    public static final Version V_0_90_7 = new Version(V_0_90_7_ID, false, org.apache.lucene.util.Version.LUCENE_45);
+    public static final Version V_0_90_7 = new Version(V_0_90_7_ID, false, org.apache.lucene.util.Version.LUCENE_4_5);
     public static final int V_0_90_8_ID = /*00*/900899;
-    public static final Version V_0_90_8 = new Version(V_0_90_8_ID, false, org.apache.lucene.util.Version.LUCENE_46);
+    public static final Version V_0_90_8 = new Version(V_0_90_8_ID, false, org.apache.lucene.util.Version.LUCENE_4_6);
     public static final int V_0_90_9_ID = /*00*/900999;
-    public static final Version V_0_90_9 = new Version(V_0_90_9_ID, false, org.apache.lucene.util.Version.LUCENE_46);
+    public static final Version V_0_90_9 = new Version(V_0_90_9_ID, false, org.apache.lucene.util.Version.LUCENE_4_6);
     public static final int V_0_90_10_ID = /*00*/901099;
-    public static final Version V_0_90_10 = new Version(V_0_90_10_ID, false, org.apache.lucene.util.Version.LUCENE_46);
+    public static final Version V_0_90_10 = new Version(V_0_90_10_ID, false, org.apache.lucene.util.Version.LUCENE_4_6);
     public static final int V_0_90_11_ID = /*00*/901199;
-    public static final Version V_0_90_11 = new Version(V_0_90_11_ID, false, org.apache.lucene.util.Version.LUCENE_46);
+    public static final Version V_0_90_11 = new Version(V_0_90_11_ID, false, org.apache.lucene.util.Version.LUCENE_4_6);
     public static final int V_0_90_12_ID = /*00*/901299;
-    public static final Version V_0_90_12 = new Version(V_0_90_12_ID, false, org.apache.lucene.util.Version.LUCENE_46);
+    public static final Version V_0_90_12 = new Version(V_0_90_12_ID, false, org.apache.lucene.util.Version.LUCENE_4_6);
     public static final int V_0_90_13_ID = /*00*/901399;
-    public static final Version V_0_90_13 = new Version(V_0_90_13_ID, false, org.apache.lucene.util.Version.LUCENE_46);
+    public static final Version V_0_90_13 = new Version(V_0_90_13_ID, false, org.apache.lucene.util.Version.LUCENE_4_6);
     public static final int V_0_90_14_ID = /*00*/901499;
-    public static final Version V_0_90_14 = new Version(V_0_90_14_ID, false, org.apache.lucene.util.Version.LUCENE_46);
+    public static final Version V_0_90_14 = new Version(V_0_90_14_ID, false, org.apache.lucene.util.Version.LUCENE_4_6);
     public static final int V_1_0_0_Beta1_ID = /*00*/1000001;
-    public static final Version V_1_0_0_Beta1 = new Version(V_1_0_0_Beta1_ID, false, org.apache.lucene.util.Version.LUCENE_45);
+    public static final Version V_1_0_0_Beta1 = new Version(V_1_0_0_Beta1_ID, false, org.apache.lucene.util.Version.LUCENE_4_5);
     public static final int V_1_0_0_Beta2_ID = /*00*/1000002;
-    public static final Version V_1_0_0_Beta2 = new Version(V_1_0_0_Beta2_ID, false, org.apache.lucene.util.Version.LUCENE_46);
+    public static final Version V_1_0_0_Beta2 = new Version(V_1_0_0_Beta2_ID, false, org.apache.lucene.util.Version.LUCENE_4_6);
     public static final int V_1_0_0_RC1_ID = /*00*/1000051;
-    public static final Version V_1_0_0_RC1 = new Version(V_1_0_0_RC1_ID, false, org.apache.lucene.util.Version.LUCENE_46);
+    public static final Version V_1_0_0_RC1 = new Version(V_1_0_0_RC1_ID, false, org.apache.lucene.util.Version.LUCENE_4_6);
     public static final int V_1_0_0_RC2_ID = /*00*/1000052;
-    public static final Version V_1_0_0_RC2 = new Version(V_1_0_0_RC2_ID, false, org.apache.lucene.util.Version.LUCENE_46);
+    public static final Version V_1_0_0_RC2 = new Version(V_1_0_0_RC2_ID, false, org.apache.lucene.util.Version.LUCENE_4_6);
     public static final int V_1_0_0_ID = /*00*/1000099;
-    public static final Version V_1_0_0 = new Version(V_1_0_0_ID, false, org.apache.lucene.util.Version.LUCENE_46);
+    public static final Version V_1_0_0 = new Version(V_1_0_0_ID, false, org.apache.lucene.util.Version.LUCENE_4_6);
     public static final int V_1_0_1_ID = /*00*/1000199;
-    public static final Version V_1_0_1 = new Version(V_1_0_1_ID, false, org.apache.lucene.util.Version.LUCENE_46);
+    public static final Version V_1_0_1 = new Version(V_1_0_1_ID, false, org.apache.lucene.util.Version.LUCENE_4_6);
     public static final int V_1_0_2_ID = /*00*/1000299;
-    public static final Version V_1_0_2 = new Version(V_1_0_2_ID, false, org.apache.lucene.util.Version.LUCENE_46);
+    public static final Version V_1_0_2 = new Version(V_1_0_2_ID, false, org.apache.lucene.util.Version.LUCENE_4_6);
     public static final int V_1_0_3_ID = /*00*/1000399;
-    public static final Version V_1_0_3 = new Version(V_1_0_3_ID, false, org.apache.lucene.util.Version.LUCENE_46);
+    public static final Version V_1_0_3 = new Version(V_1_0_3_ID, false, org.apache.lucene.util.Version.LUCENE_4_6);
     public static final int V_1_0_4_ID = /*00*/1000499;
-    public static final Version V_1_0_4 = new Version(V_1_0_3_ID, false, org.apache.lucene.util.Version.LUCENE_46);
+    public static final Version V_1_0_4 = new Version(V_1_0_3_ID, false, org.apache.lucene.util.Version.LUCENE_4_6);
     public static final int V_1_1_0_ID = /*00*/1010099;
-    public static final Version V_1_1_0 = new Version(V_1_1_0_ID, false, org.apache.lucene.util.Version.LUCENE_47);
+    public static final Version V_1_1_0 = new Version(V_1_1_0_ID, false, org.apache.lucene.util.Version.LUCENE_4_7);
     public static final int V_1_1_1_ID = /*00*/1010199;
-    public static final Version V_1_1_1 = new Version(V_1_1_1_ID, false, org.apache.lucene.util.Version.LUCENE_47);
+    public static final Version V_1_1_1 = new Version(V_1_1_1_ID, false, org.apache.lucene.util.Version.LUCENE_4_7);
     public static final int V_1_1_2_ID = /*00*/1010299;
-    public static final Version V_1_1_2 = new Version(V_1_1_2_ID, false, org.apache.lucene.util.Version.LUCENE_47);
+    public static final Version V_1_1_2 = new Version(V_1_1_2_ID, false, org.apache.lucene.util.Version.LUCENE_4_7);
     public static final int V_1_2_0_ID = /*00*/1020099;
-    public static final Version V_1_2_0 = new Version(V_1_2_0_ID, false, org.apache.lucene.util.Version.LUCENE_48);
+    public static final Version V_1_2_0 = new Version(V_1_2_0_ID, false, org.apache.lucene.util.Version.LUCENE_4_8);
     public static final int V_1_2_1_ID = /*00*/1020199;
-    public static final Version V_1_2_1 = new Version(V_1_2_1_ID, false, org.apache.lucene.util.Version.LUCENE_48);
+    public static final Version V_1_2_1 = new Version(V_1_2_1_ID, false, org.apache.lucene.util.Version.LUCENE_4_8);
     public static final int V_1_2_2_ID = /*00*/1020299;
-    public static final Version V_1_2_2 = new Version(V_1_2_2_ID, false, org.apache.lucene.util.Version.LUCENE_48);
+    public static final Version V_1_2_2 = new Version(V_1_2_2_ID, false, org.apache.lucene.util.Version.LUCENE_4_8);
     public static final int V_1_3_0_ID = /*00*/1030099;
-    public static final Version V_1_3_0 = new Version(V_1_3_0_ID, false, org.apache.lucene.util.Version.LUCENE_48);
+    public static final Version V_1_3_0 = new Version(V_1_3_0_ID, false, org.apache.lucene.util.Version.LUCENE_4_9);
     public static final int V_2_0_0_ID = /*00*/2000099;
-    public static final Version V_2_0_0 = new Version(V_2_0_0_ID, true, org.apache.lucene.util.Version.LUCENE_48);
+    public static final Version V_2_0_0 = new Version(V_2_0_0_ID, true, org.apache.lucene.util.Version.LUCENE_4_9);

     public static final Version CURRENT = V_2_0_0;
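The /*00*/-prefixed ids above are not arbitrary: they pack major, minor, revision and build into a fixed decimal layout, which is what makes plain numeric before/after comparisons between version ids valid. A sketch of the decoding as it can be inferred from the constants above (the beta/RC/GA build-number convention is read off the data, not stated anywhere in the patch):

import java.util.Locale;

public class VersionIdExample {
    public static void main(String[] args) {
        int id = 1020199; // V_1_2_1_ID
        int major = (id / 1000000) % 100; // 1
        int minor = (id / 10000) % 100;   // 2
        int revision = (id / 100) % 100;  // 1
        int build = id % 100;             // 99; by convention 0x = beta, 5x = RC, 99 = GA
        System.out.println(String.format(Locale.ROOT, "%d.%d.%d (build %d)", major, minor, revision, build));
    }
}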
diff --git a/src/main/java/org/elasticsearch/common/compress/lzf/LZFCompressedIndexInput.java b/src/main/java/org/elasticsearch/common/compress/lzf/LZFCompressedIndexInput.java
index 6196fbe3a3e1a..326eceb77c4e2 100644
--- a/src/main/java/org/elasticsearch/common/compress/lzf/LZFCompressedIndexInput.java
+++ b/src/main/java/org/elasticsearch/common/compress/lzf/LZFCompressedIndexInput.java
@@ -21,6 +21,7 @@
 import com.ning.compress.lzf.ChunkDecoder;
 import com.ning.compress.lzf.LZFChunk;
+import org.apache.lucene.store.BufferedIndexInput;
 import org.apache.lucene.store.IndexInput;
 import org.elasticsearch.common.compress.CompressedIndexInput;
 import org.elasticsearch.common.lucene.store.InputStreamIndexInput;
@@ -71,4 +72,9 @@ public IndexInput clone() {
         cloned.inputBuffer = new byte[LZFChunk.MAX_CHUNK_LEN];
         return cloned;
     }
+
+    @Override
+    public IndexInput slice(String description, long offset, long length) throws IOException {
+        return BufferedIndexInput.wrap(description, this, offset, length);
+    }
 }
diff --git a/src/main/java/org/elasticsearch/common/lucene/Lucene.java b/src/main/java/org/elasticsearch/common/lucene/Lucene.java
index 33d74c4592d0c..1db482c7574e0 100644
--- a/src/main/java/org/elasticsearch/common/lucene/Lucene.java
+++ b/src/main/java/org/elasticsearch/common/lucene/Lucene.java
@@ -45,7 +45,7 @@
  */
 public class Lucene {

-    public static final Version VERSION = Version.LUCENE_48;
+    public static final Version VERSION = Version.LUCENE_4_9;

     public static final Version ANALYZER_VERSION = VERSION;
     public static final Version QUERYPARSER_VERSION = VERSION;
@@ -63,56 +63,28 @@ public static Version parseVersion(@Nullable String version, Version defaultVers
         if (version == null) {
             return defaultVersion;
         }
-        if ("4.8".equals(version)) {
-            return VERSION.LUCENE_48;
+        switch(version) {
+            case "4.9": return VERSION.LUCENE_4_9;
+            case "4.8": return VERSION.LUCENE_4_8;
+            case "4.7": return VERSION.LUCENE_4_7;
+            case "4.6": return VERSION.LUCENE_4_6;
+            case "4.5": return VERSION.LUCENE_4_5;
+            case "4.4": return VERSION.LUCENE_4_4;
+            case "4.3": return VERSION.LUCENE_4_3;
+            case "4.2": return VERSION.LUCENE_4_2;
+            case "4.1": return VERSION.LUCENE_4_1;
+            case "4.0": return VERSION.LUCENE_4_0;
+            case "3.6": return VERSION.LUCENE_3_6;
+            case "3.5": return VERSION.LUCENE_3_5;
+            case "3.4": return VERSION.LUCENE_3_4;
+            case "3.3": return VERSION.LUCENE_3_3;
+            case "3.2": return VERSION.LUCENE_3_2;
+            case "3.1": return VERSION.LUCENE_3_1;
+            case "3.0": return VERSION.LUCENE_3_0;
+            default:
+                logger.warn("no version match {}, default to {}", version, defaultVersion);
+                return defaultVersion;
         }
-        if ("4.7".equals(version)) {
-            return VERSION.LUCENE_47;
-        }
-        if ("4.6".equals(version)) {
-            return VERSION.LUCENE_46;
-        }
-        if ("4.5".equals(version)) {
-            return VERSION.LUCENE_45;
-        }
-        if ("4.4".equals(version)) {
-            return VERSION.LUCENE_44;
-        }
-        if ("4.3".equals(version)) {
-            return Version.LUCENE_43;
-        }
-        if ("4.2".equals(version)) {
-            return Version.LUCENE_42;
-        }
-        if ("4.1".equals(version)) {
-            return Version.LUCENE_41;
-        }
-        if ("4.0".equals(version)) {
-            return Version.LUCENE_40;
-        }
-        if ("3.6".equals(version)) {
-            return Version.LUCENE_36;
-        }
-        if ("3.5".equals(version)) {
-            return Version.LUCENE_35;
-        }
-        if ("3.4".equals(version)) {
-            return Version.LUCENE_34;
-        }
-        if ("3.3".equals(version)) {
-            return Version.LUCENE_33;
-        }
-        if ("3.2".equals(version)) {
-            return Version.LUCENE_32;
-        }
-        if ("3.1".equals(version)) {
-            return Version.LUCENE_31;
-        }
-        if ("3.0".equals(version)) {
-            return Version.LUCENE_30;
-        }
-        logger.warn("no version match {}, default to {}", version, defaultVersion);
-        return defaultVersion;
     }

     /**
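Lucene 4.9 makes slice() part of the IndexInput contract, which is why the compressed input above now delegates to BufferedIndexInput.wrap. A minimal sketch of what the contract gives callers (the directory contents, file name and offsets are made up for illustration):

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;

public class SliceExample {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexOutput out = dir.createOutput("data", IOContext.DEFAULT);
        for (int i = 0; i < 16; i++) {
            out.writeByte((byte) i); // bytes 0..15
        }
        out.close();
        IndexInput in = dir.openInput("data", IOContext.READONCE);
        // a slice is an independent, zero-positioned view over [offset, offset + length)
        IndexInput slice = in.slice("middle eight bytes", 4, 8);
        System.out.println(slice.length());   // 8
        System.out.println(slice.readByte()); // 4
        in.close();
        dir.close();
    }
}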
diff --git a/src/main/java/org/elasticsearch/common/lucene/SegmentReaderUtils.java b/src/main/java/org/elasticsearch/common/lucene/SegmentReaderUtils.java
index 3d40d9d8b376e..5efe64fcd2078 100644
--- a/src/main/java/org/elasticsearch/common/lucene/SegmentReaderUtils.java
+++ b/src/main/java/org/elasticsearch/common/lucene/SegmentReaderUtils.java
@@ -21,7 +21,6 @@
 import org.apache.lucene.index.AtomicReader;
 import org.apache.lucene.index.FilterAtomicReader;
 import org.apache.lucene.index.SegmentReader;
-import org.apache.lucene.util.Version;
 import org.elasticsearch.ElasticsearchIllegalStateException;
 import org.elasticsearch.common.Nullable;
@@ -45,17 +44,9 @@ public static SegmentReader segmentReaderOrNull(AtomicReader reader) {
         return internalSegmentReader(reader, false);
     }

-    static {
-        assert Version.LUCENE_48.onOrAfter(Lucene.VERSION) : "Use AtomicReader.addCoreClosedListener instead of trying to unwrap the atomic reader: https://issues.apache.org/jira/browse/LUCENE-5701";
-    }
-
     public static boolean registerCoreListener(AtomicReader reader, SegmentReader.CoreClosedListener listener) {
-        SegmentReader segReader = SegmentReaderUtils.segmentReaderOrNull(reader);
-        if (segReader != null) {
-            segReader.addCoreClosedListener(listener);
-            return true;
-        }
-        return false;
+        reader.addCoreClosedListener(listener);
+        return true;
     }

     private static SegmentReader internalSegmentReader(AtomicReader reader, boolean fail) {
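With the unwrapping gone, any per-segment consumer can attach an eviction hook straight to the AtomicReader. A sketch of the pattern (the cache map here is hypothetical; the listener API is the one the hunk above switches to):

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.SegmentReader;

public class CoreListenerExample {
    private final Map<Object, Object> perSegmentCache = new ConcurrentHashMap<>();

    public void cacheFor(AtomicReader reader, Object value) {
        perSegmentCache.put(reader.getCoreCacheKey(), value);
        // evict the entry as soon as the segment core is closed (e.g. after a merge)
        reader.addCoreClosedListener(new SegmentReader.CoreClosedListener() {
            @Override
            public void onClose(Object ownerCoreCacheKey) {
                perSegmentCache.remove(ownerCoreCacheKey);
            }
        });
    }
}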
diff --git a/src/main/java/org/elasticsearch/common/lucene/all/AllField.java b/src/main/java/org/elasticsearch/common/lucene/all/AllField.java
index e917b07fa8288..f1bd209eaf29e 100644
--- a/src/main/java/org/elasticsearch/common/lucene/all/AllField.java
+++ b/src/main/java/org/elasticsearch/common/lucene/all/AllField.java
@@ -25,7 +25,6 @@
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.elasticsearch.ElasticsearchException;
-import org.elasticsearch.Version;

 import java.io.IOException;
 import java.io.Reader;
@@ -63,15 +62,14 @@ public AllEntries getAllEntries() {
         return allEntries;
     }

-    static {
-        assert Version.CURRENT.luceneVersion == org.apache.lucene.util.Version.LUCENE_48: "Re-use the incoming AllTokenStream once we upgrade to Lucene 4.9";
-    }
-
     @Override
-    public TokenStream tokenStream(Analyzer analyzer) throws IOException {
+    public TokenStream tokenStream(Analyzer analyzer, TokenStream previous) throws IOException {
         try {
             allEntries.reset(); // reset the all entries, just in case it was read already
             if (allEntries.customBoost() && fieldType().indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
+                // TODO: we should be able to reuse "previous" if its instanceof AllTokenStream?
+                // but we need to be careful this optimization is safe (and tested)...
+                // AllTokenStream maps boost to 4-byte payloads, so we only need to use it if any field had non-default (!= 1.0f) boost and if
+                // positions are indexed:
                 return AllTokenStream.allTokenStream(name, allEntries, analyzer);
diff --git a/src/main/java/org/elasticsearch/common/lucene/docset/AllDocIdSet.java b/src/main/java/org/elasticsearch/common/lucene/docset/AllDocIdSet.java
index 7dad875476958..fe7613879dfd7 100644
--- a/src/main/java/org/elasticsearch/common/lucene/docset/AllDocIdSet.java
+++ b/src/main/java/org/elasticsearch/common/lucene/docset/AllDocIdSet.java
@@ -22,6 +22,7 @@
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.RamUsageEstimator;

 import java.io.IOException;

@@ -44,6 +45,11 @@ public boolean isCacheable() {
         return true;
     }

+    @Override
+    public long ramBytesUsed() {
+        return RamUsageEstimator.NUM_BYTES_INT;
+    }
+
     @Override
     public DocIdSetIterator iterator() throws IOException {
         return new Iterator(maxDoc);
diff --git a/src/main/java/org/elasticsearch/common/lucene/docset/AndDocIdSet.java b/src/main/java/org/elasticsearch/common/lucene/docset/AndDocIdSet.java
index 385a33cfa10e9..97a1ab8228fb7 100644
--- a/src/main/java/org/elasticsearch/common/lucene/docset/AndDocIdSet.java
+++ b/src/main/java/org/elasticsearch/common/lucene/docset/AndDocIdSet.java
@@ -22,6 +22,7 @@
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.RamUsageEstimator;

 import java.io.IOException;
 import java.util.ArrayList;
@@ -48,6 +49,15 @@ public boolean isCacheable() {
         return true;
     }

+    @Override
+    public long ramBytesUsed() {
+        long ramBytesUsed = RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER;
+        for (DocIdSet set : sets) {
+            ramBytesUsed += RamUsageEstimator.NUM_BYTES_OBJECT_REF + set.ramBytesUsed();
+        }
+        return ramBytesUsed;
+    }
+
     @Override
     public Bits bits() throws IOException {
         Bits[] bits = new Bits[sets.length];
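These overrides exist because Lucene 4.9 folds memory accounting into DocIdSet itself (the Accountable API), so a cache can ask each set for its exact footprint instead of guessing. A small sketch of what that buys, using a stock implementation (the bit set size is illustrative):

import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RamUsageEstimator;

public class RamAccountingExample {
    public static void main(String[] args) {
        FixedBitSet bits = new FixedBitSet(1 << 20); // room for ~1M docs
        bits.set(42);
        // the set reports its own footprint; no more hand-rolled "length * 8 + 16"
        long bytes = bits.ramBytesUsed();
        System.out.println(RamUsageEstimator.humanReadableUnits(bytes));
    }
}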
diff --git a/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java b/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java
index e2f417fb3b1a3..69d3ca4f32e59 100644
--- a/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java
+++ b/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java
@@ -24,6 +24,7 @@
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.RamUsageEstimator;
 import org.elasticsearch.common.Nullable;

 import java.io.IOException;
@@ -32,19 +33,18 @@
  */
 public class DocIdSets {

+    /**
+     * Return the size of the doc id set, plus a reference to it.
+     */
     public static long sizeInBytes(DocIdSet docIdSet) {
-        if (docIdSet instanceof FixedBitSet) {
-            return ((FixedBitSet) docIdSet).getBits().length * 8 + 16;
-        }
-        // only for empty ones and unknowns...
-        return 1;
+        return RamUsageEstimator.NUM_BYTES_OBJECT_REF + docIdSet.ramBytesUsed();
     }

     /**
      * Is it an empty {@link DocIdSet}?
      */
     public static boolean isEmpty(@Nullable DocIdSet set) {
-        return set == null || set == EMPTY_DOCIDSET;
+        return set == null || set == DocIdSet.EMPTY;
     }

     /**
@@ -63,16 +63,16 @@ public static boolean isFastIterator(DocIdSet set) {
      * always either return an empty {@link DocIdSet} or {@link FixedBitSet} but never null.
      */
     public static DocIdSet toCacheable(AtomicReader reader, @Nullable DocIdSet set) throws IOException {
-        if (set == null || set == EMPTY_DOCIDSET) {
-            return EMPTY_DOCIDSET;
+        if (set == null || set == DocIdSet.EMPTY) {
+            return DocIdSet.EMPTY;
         }
         DocIdSetIterator it = set.iterator();
         if (it == null) {
-            return EMPTY_DOCIDSET;
+            return DocIdSet.EMPTY;
         }
         int doc = it.nextDoc();
         if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-            return EMPTY_DOCIDSET;
+            return DocIdSet.EMPTY;
         }
         if (set instanceof FixedBitSet) {
             return set;
@@ -85,26 +85,6 @@ public static DocIdSet toCacheable(AtomicReader reader, @Nullable DocIdSet set)
         } while (doc != DocIdSetIterator.NO_MORE_DOCS);
         return fixedBitSet;
     }
-
-    /** An empty {@code DocIdSet} instance */
-    protected static final DocIdSet EMPTY_DOCIDSET = new DocIdSet() {
-
-        @Override
-        public DocIdSetIterator iterator() {
-            return DocIdSetIterator.empty();
-        }
-
-        @Override
-        public boolean isCacheable() {
-            return true;
-        }
-
-        // we explicitly provide no random access, as this filter is 100% sparse and iterator exits faster
-        @Override
-        public Bits bits() {
-            return null;
-        }
-    };

     /**
      * Gets a set to bits.
diff --git a/src/main/java/org/elasticsearch/common/lucene/docset/NotDocIdSet.java b/src/main/java/org/elasticsearch/common/lucene/docset/NotDocIdSet.java
index 670f4acd3ead2..04d556e8290ee 100644
--- a/src/main/java/org/elasticsearch/common/lucene/docset/NotDocIdSet.java
+++ b/src/main/java/org/elasticsearch/common/lucene/docset/NotDocIdSet.java
@@ -22,6 +22,7 @@
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.RamUsageEstimator;

 import java.io.IOException;

@@ -43,6 +44,11 @@ public boolean isCacheable() {
         return set.isCacheable();
     }

+    @Override
+    public long ramBytesUsed() {
+        return RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_INT + set.ramBytesUsed();
+    }
+
     @Override
     public Bits bits() throws IOException {
         Bits bits = set.bits();
diff --git a/src/main/java/org/elasticsearch/common/lucene/docset/OrDocIdSet.java b/src/main/java/org/elasticsearch/common/lucene/docset/OrDocIdSet.java
index 7b675254bde4d..845f038627e27 100644
--- a/src/main/java/org/elasticsearch/common/lucene/docset/OrDocIdSet.java
+++ b/src/main/java/org/elasticsearch/common/lucene/docset/OrDocIdSet.java
@@ -22,6 +22,7 @@
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.RamUsageEstimator;

 import java.io.IOException;

@@ -46,6 +47,15 @@ public boolean isCacheable() {
         return true;
     }

+    @Override
+    public long ramBytesUsed() {
+        long ramBytesUsed = RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER;
+        for (DocIdSet set : sets) {
+            ramBytesUsed += RamUsageEstimator.NUM_BYTES_OBJECT_REF + set.ramBytesUsed();
+        }
+        return ramBytesUsed;
+    }
+
     @Override
     public Bits bits() throws IOException {
         Bits[] bits = new Bits[sets.length];
diff --git a/src/main/java/org/elasticsearch/common/lucene/search/ApplyAcceptedDocsFilter.java b/src/main/java/org/elasticsearch/common/lucene/search/ApplyAcceptedDocsFilter.java
index 1ee63adf3bc8d..097584bf0e056 100644
--- a/src/main/java/org/elasticsearch/common/lucene/search/ApplyAcceptedDocsFilter.java
+++ b/src/main/java/org/elasticsearch/common/lucene/search/ApplyAcceptedDocsFilter.java
@@ -23,6 +23,7 @@
 import org.apache.lucene.search.*;
 import org.apache.lucene.util.Bits;
 import
org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.RamUsageEstimator; import org.elasticsearch.common.lucene.docset.DocIdSets; import java.io.IOException; @@ -90,6 +91,11 @@ public boolean isCacheable() { return innerSet.isCacheable(); } + @Override + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_OBJECT_REF + innerSet.ramBytesUsed(); + } + @Override public Bits bits() throws IOException { Bits bits = innerSet.bits(); @@ -202,5 +208,10 @@ public Bits bits() throws IOException { public boolean isCacheable() { return delegate.isCacheable(); } + + @Override + public long ramBytesUsed() { + return RamUsageEstimator.NUM_BYTES_OBJECT_REF + delegate.ramBytesUsed(); + } } } diff --git a/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java b/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java index bd0909538fccb..c798b5e6f4446 100644 --- a/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java +++ b/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java @@ -21,6 +21,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.queries.mlt.MoreLikeThis; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; @@ -49,14 +50,14 @@ public class MoreLikeThisQuery extends Query { private String[] moreLikeFields; private Analyzer analyzer; private float percentTermsToMatch = DEFAULT_PERCENT_TERMS_TO_MATCH; - private int minTermFrequency = XMoreLikeThis.DEFAULT_MIN_TERM_FREQ; - private int maxQueryTerms = XMoreLikeThis.DEFAULT_MAX_QUERY_TERMS; - private Set stopWords = XMoreLikeThis.DEFAULT_STOP_WORDS; - private int minDocFreq = XMoreLikeThis.DEFAULT_MIN_DOC_FREQ; - private int maxDocFreq = XMoreLikeThis.DEFAULT_MAX_DOC_FREQ; - private int minWordLen = XMoreLikeThis.DEFAULT_MIN_WORD_LENGTH; - private int maxWordLen = XMoreLikeThis.DEFAULT_MAX_WORD_LENGTH; - private boolean boostTerms = XMoreLikeThis.DEFAULT_BOOST; + private int minTermFrequency = MoreLikeThis.DEFAULT_MIN_TERM_FREQ; + private int maxQueryTerms = MoreLikeThis.DEFAULT_MAX_QUERY_TERMS; + private Set stopWords = MoreLikeThis.DEFAULT_STOP_WORDS; + private int minDocFreq = MoreLikeThis.DEFAULT_MIN_DOC_FREQ; + private int maxDocFreq = MoreLikeThis.DEFAULT_MAX_DOC_FREQ; + private int minWordLen = MoreLikeThis.DEFAULT_MIN_WORD_LENGTH; + private int maxWordLen = MoreLikeThis.DEFAULT_MAX_WORD_LENGTH; + private boolean boostTerms = MoreLikeThis.DEFAULT_BOOST; private float boostTermsFactor = 1; @@ -134,7 +135,7 @@ public boolean equals(Object obj) { @Override public Query rewrite(IndexReader reader) throws IOException { - XMoreLikeThis mlt = new XMoreLikeThis(reader, similarity == null ? new DefaultSimilarity() : similarity); + MoreLikeThis mlt = new MoreLikeThis(reader, similarity == null ? new DefaultSimilarity() : similarity); mlt.setFieldNames(moreLikeFields); mlt.setAnalyzer(analyzer); diff --git a/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java b/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java deleted file mode 100644 index f2314afe3b58d..0000000000000 --- a/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java +++ /dev/null @@ -1,964 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/** - * Copyright 2004-2005 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.elasticsearch.common.lucene.search; - -import java.io.*; -import java.util.*; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.Fields; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexableField; -import org.apache.lucene.index.MultiFields; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.*; -import org.apache.lucene.search.similarities.DefaultSimilarity; -import org.apache.lucene.search.similarities.TFIDFSimilarity; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.CharsRef; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.PriorityQueue; -import org.apache.lucene.util.UnicodeUtil; -import org.elasticsearch.Version; -import org.elasticsearch.common.io.FastStringReader; - -/** - * Generate "more like this" similarity queries. - * Based on this mail: - *
- * Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
- * Term frequencies can be computed by re-tokenizing the text, which, for a single document,
- * is usually fast enough.  But looking up the docFreq() of every term in the document is
- * probably too slow.
- *
- * You can use some heuristics to prune the set of terms, to avoid calling docFreq() too much,
- * or at all. Since you're trying to maximize a tf*idf score, you're probably most interested
- * in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
- * reduce the number of terms under consideration. Another heuristic is that terms with a
- * high idf (i.e., a low df) tend to be longer. So you could threshold the terms by the
- * number of characters, not selecting anything less than, e.g., six or seven characters.
- * With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms
- * that do a pretty good job of characterizing a document.
- *
- * It all depends on what you're trying to do. If you're trying to eek out that last percent
- * of precision and recall regardless of computational difficulty so that you can win a TREC
- * competition, then the techniques I mention above are useless. But if you're trying to
- * provide a "more like this" button on a search results page that does a decent job and has
- * good performance, such techniques might be useful.
- *
- * An efficient, effective "more-like-this" query generator would be a great contribution, if
- * anyone's interested. I'd imagine that it would take a Reader or a String (the document's
- * text), analyzer Analyzer, and return a set of representative terms using heuristics like those
- * above. The frequency and length thresholds could be parameters, etc.
- *
- * Doug
- *
- * Initial Usage
- *
- * This class has lots of options to try to make it efficient and flexible.
- * The simplest possible usage is as follows. The bold
- * fragment is specific to this class.
- *
- * IndexReader ir = ...
- * IndexSearcher is = ...
- *
- * MoreLikeThis mlt = new MoreLikeThis(ir);
- * Reader target = ... // orig source of doc you want to find similarities to
- * Query query = mlt.like( target);
- *
- * Hits hits = is.search(query);
- * // now the usual iteration thru 'hits' - the only thing to watch for is to make sure
- * // you ignore the doc if it matches your 'target' document, as it should be similar to itself
- *
- * Thus you:
- *
- *   1. do your normal, Lucene setup for searching,
- *   2. create a MoreLikeThis,
- *   3. get the text of the doc you want to find similarities to
- *   4. then call one of the like() calls to generate a similarity query
- *   5. call the searcher to find the similar docs
- *
- * More Advanced Usage
- *
- * You may want to use {@link #setFieldNames setFieldNames(...)} so you can examine
- * multiple fields (e.g. body and title) for similarity.
- *
- * Depending on the size of your index and the size and makeup of your documents you
- * may want to call the other set methods to control how the similarity queries are
- * generated:
- *
- *   - {@link #setMinTermFreq setMinTermFreq(...)}
- *   - {@link #setMinDocFreq setMinDocFreq(...)}
- *   - {@link #setMaxDocFreq setMaxDocFreq(...)}
- *   - {@link #setMaxDocFreqPct setMaxDocFreqPct(...)}
- *   - {@link #setMinWordLen setMinWordLen(...)}
- *   - {@link #setMaxWordLen setMaxWordLen(...)}
- *   - {@link #setMaxQueryTerms setMaxQueryTerms(...)}
- *   - {@link #setMaxNumTokensParsed setMaxNumTokensParsed(...)}
- *   - {@link #setStopWords setStopWord(...)}
- *
- * Changes: Mark Harwood 29/02/04
- * Some bugfixing, some refactoring, some optimisation.
- * - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
- * - bugfix: No significant terms being created for fields with a termvector - because
- * was only counting one occurrence per term/field pair in calculations(ie not including frequency info from TermVector)
- * - refactor: moved common code into isNoiseWord()
- * - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization
- */ - -public final class XMoreLikeThis { - - static { - assert Version.CURRENT.luceneVersion == org.apache.lucene.util.Version.LUCENE_48: "Remove this class once we upgrade to Lucene 4.9"; - } - - /** - * Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support. - * - * @see #getMaxNumTokensParsed - */ - public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000; - - /** - * Ignore terms with less than this frequency in the source doc. - * - * @see #getMinTermFreq - * @see #setMinTermFreq - */ - public static final int DEFAULT_MIN_TERM_FREQ = 2; - - /** - * Ignore words which do not occur in at least this many docs. - * - * @see #getMinDocFreq - * @see #setMinDocFreq - */ - public static final int DEFAULT_MIN_DOC_FREQ = 5; - - /** - * Ignore words which occur in more than this many docs. - * - * @see #getMaxDocFreq - * @see #setMaxDocFreq - * @see #setMaxDocFreqPct - */ - public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE; - - /** - * Boost terms in query based on score. - * - * @see #isBoost - * @see #setBoost - */ - public static final boolean DEFAULT_BOOST = false; - - /** - * Default field names. Null is used to specify that the field names should be looked - * up at runtime from the provided reader. - */ - public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"}; - - /** - * Ignore words less than this length or if 0 then this has no effect. - * - * @see #getMinWordLen - * @see #setMinWordLen - */ - public static final int DEFAULT_MIN_WORD_LENGTH = 0; - - /** - * Ignore words greater than this length or if 0 then this has no effect. - * - * @see #getMaxWordLen - * @see #setMaxWordLen - */ - public static final int DEFAULT_MAX_WORD_LENGTH = 0; - - /** - * Default set of stopwords. - * If null means to allow stop words. - * - * @see #setStopWords - * @see #getStopWords - */ - public static final Set DEFAULT_STOP_WORDS = null; - - /** - * Current set of stop words. - */ - private Set stopWords = DEFAULT_STOP_WORDS; - - /** - * Return a Query with no more than this many terms. - * - * @see BooleanQuery#getMaxClauseCount - * @see #getMaxQueryTerms - * @see #setMaxQueryTerms - */ - public static final int DEFAULT_MAX_QUERY_TERMS = 25; - - /** - * Analyzer that will be used to parse the doc. - */ - private Analyzer analyzer = null; - - /** - * Ignore words less frequent that this. - */ - private int minTermFreq = DEFAULT_MIN_TERM_FREQ; - - /** - * Ignore words which do not occur in at least this many docs. - */ - private int minDocFreq = DEFAULT_MIN_DOC_FREQ; - - /** - * Ignore words which occur in more than this many docs. - */ - private int maxDocFreq = DEFAULT_MAX_DOC_FREQ; - - /** - * Should we apply a boost to the Query based on the scores? - */ - private boolean boost = DEFAULT_BOOST; - - /** - * Field name we'll analyze. - */ - private String[] fieldNames = DEFAULT_FIELD_NAMES; - - /** - * The maximum number of tokens to parse in each example doc field that is not stored with TermVector support - */ - private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED; - - /** - * Ignore words if less than this len. - */ - private int minWordLen = DEFAULT_MIN_WORD_LENGTH; - - /** - * Ignore words if greater than this len. - */ - private int maxWordLen = DEFAULT_MAX_WORD_LENGTH; - - /** - * Don't return a query longer than this. - */ - private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS; - - /** - * For idf() calculations. 
- */ - private TFIDFSimilarity similarity;// = new DefaultSimilarity(); - - /** - * IndexReader to use - */ - private final IndexReader ir; - - /** - * Boost factor to use when boosting the terms - */ - private float boostFactor = 1; - - /** - * Returns the boost factor used when boosting terms - * - * @return the boost factor used when boosting terms - * @see #setBoostFactor(float) - */ - public float getBoostFactor() { - return boostFactor; - } - - /** - * Sets the boost factor to use when boosting terms - * - * @see #getBoostFactor() - */ - public void setBoostFactor(float boostFactor) { - this.boostFactor = boostFactor; - } - - /** - * Constructor requiring an IndexReader. - */ - public XMoreLikeThis(IndexReader ir) { - this(ir, new DefaultSimilarity()); - } - - public XMoreLikeThis(IndexReader ir, TFIDFSimilarity sim) { - this.ir = ir; - this.similarity = sim; - } - - - public TFIDFSimilarity getSimilarity() { - return similarity; - } - - public void setSimilarity(TFIDFSimilarity similarity) { - this.similarity = similarity; - } - - /** - * Returns an analyzer that will be used to parse source doc with. The default analyzer - * is not set. - * - * @return the analyzer that will be used to parse source doc with. - */ - public Analyzer getAnalyzer() { - return analyzer; - } - - /** - * Sets the analyzer to use. An analyzer is not required for generating a query with the - * {@link #like(int)} method, all other 'like' methods require an analyzer. - * - * @param analyzer the analyzer to use to tokenize text. - */ - public void setAnalyzer(Analyzer analyzer) { - this.analyzer = analyzer; - } - - /** - * Returns the frequency below which terms will be ignored in the source doc. The default - * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}. - * - * @return the frequency below which terms will be ignored in the source doc. - */ - public int getMinTermFreq() { - return minTermFreq; - } - - /** - * Sets the frequency below which terms will be ignored in the source doc. - * - * @param minTermFreq the frequency below which terms will be ignored in the source doc. - */ - public void setMinTermFreq(int minTermFreq) { - this.minTermFreq = minTermFreq; - } - - /** - * Returns the frequency at which words will be ignored which do not occur in at least this - * many docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}. - * - * @return the frequency at which words will be ignored which do not occur in at least this - * many docs. - */ - public int getMinDocFreq() { - return minDocFreq; - } - - /** - * Sets the frequency at which words will be ignored which do not occur in at least this - * many docs. - * - * @param minDocFreq the frequency at which words will be ignored which do not occur in at - * least this many docs. - */ - public void setMinDocFreq(int minDocFreq) { - this.minDocFreq = minDocFreq; - } - - /** - * Returns the maximum frequency in which words may still appear. - * Words that appear in more than this many docs will be ignored. The default frequency is - * {@link #DEFAULT_MAX_DOC_FREQ}. - * - * @return get the maximum frequency at which words are still allowed, - * words which occur in more docs than this are ignored. - */ - public int getMaxDocFreq() { - return maxDocFreq; - } - - /** - * Set the maximum frequency in which words may still appear. Words that appear - * in more than this many docs will be ignored. 
- * - * @param maxFreq the maximum count of documents that a term may appear - * in to be still considered relevant - */ - public void setMaxDocFreq(int maxFreq) { - this.maxDocFreq = maxFreq; - } - - /** - * Set the maximum percentage in which words may still appear. Words that appear - * in more than this many percent of all docs will be ignored. - * - * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear - * in to be still considered relevant - */ - public void setMaxDocFreqPct(int maxPercentage) { - this.maxDocFreq = maxPercentage * ir.numDocs() / 100; - } - - - /** - * Returns whether to boost terms in query based on "score" or not. The default is - * {@link #DEFAULT_BOOST}. - * - * @return whether to boost terms in query based on "score" or not. - * @see #setBoost - */ - public boolean isBoost() { - return boost; - } - - /** - * Sets whether to boost terms in query based on "score" or not. - * - * @param boost true to boost terms in query based on "score", false otherwise. - * @see #isBoost - */ - public void setBoost(boolean boost) { - this.boost = boost; - } - - /** - * Returns the field names that will be used when generating the 'More Like This' query. - * The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}. - * - * @return the field names that will be used when generating the 'More Like This' query. - */ - public String[] getFieldNames() { - return fieldNames; - } - - /** - * Sets the field names that will be used when generating the 'More Like This' query. - * Set this to null for the field names to be determined at runtime from the IndexReader - * provided in the constructor. - * - * @param fieldNames the field names that will be used when generating the 'More Like This' - * query. - */ - public void setFieldNames(String[] fieldNames) { - this.fieldNames = fieldNames; - } - - /** - * Returns the minimum word length below which words will be ignored. Set this to 0 for no - * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}. - * - * @return the minimum word length below which words will be ignored. - */ - public int getMinWordLen() { - return minWordLen; - } - - /** - * Sets the minimum word length below which words will be ignored. - * - * @param minWordLen the minimum word length below which words will be ignored. - */ - public void setMinWordLen(int minWordLen) { - this.minWordLen = minWordLen; - } - - /** - * Returns the maximum word length above which words will be ignored. Set this to 0 for no - * maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}. - * - * @return the maximum word length above which words will be ignored. - */ - public int getMaxWordLen() { - return maxWordLen; - } - - /** - * Sets the maximum word length above which words will be ignored. - * - * @param maxWordLen the maximum word length above which words will be ignored. - */ - public void setMaxWordLen(int maxWordLen) { - this.maxWordLen = maxWordLen; - } - - /** - * Set the set of stopwords. - * Any word in this set is considered "uninteresting" and ignored. - * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as - * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting". 
- * - * @param stopWords set of stopwords, if null it means to allow stop words - * @see #getStopWords - */ - public void setStopWords(Set stopWords) { - this.stopWords = stopWords; - } - - /** - * Get the current stop words being used. - * - * @see #setStopWords - */ - public Set getStopWords() { - return stopWords; - } - - - /** - * Returns the maximum number of query terms that will be included in any generated query. - * The default is {@link #DEFAULT_MAX_QUERY_TERMS}. - * - * @return the maximum number of query terms that will be included in any generated query. - */ - public int getMaxQueryTerms() { - return maxQueryTerms; - } - - /** - * Sets the maximum number of query terms that will be included in any generated query. - * - * @param maxQueryTerms the maximum number of query terms that will be included in any - * generated query. - */ - public void setMaxQueryTerms(int maxQueryTerms) { - this.maxQueryTerms = maxQueryTerms; - } - - /** - * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support - * @see #DEFAULT_MAX_NUM_TOKENS_PARSED - */ - public int getMaxNumTokensParsed() { - return maxNumTokensParsed; - } - - /** - * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support - */ - public void setMaxNumTokensParsed(int i) { - maxNumTokensParsed = i; - } - - - /** - * Return a query that will return docs like the passed lucene document ID. - * - * @param docNum the documentID of the lucene doc to generate the 'More Like This" query for. - * @return a query that will return docs like the passed lucene document ID. - */ - public Query like(int docNum) throws IOException { - if (fieldNames == null) { - // gather list of valid fields from lucene - Collection fields = MultiFields.getIndexedFields(ir); - fieldNames = fields.toArray(new String[fields.size()]); - } - - return createQuery(retrieveTerms(docNum)); - } - - /** - * Return a query that will return docs like the passed Reader. - * - * @return a query that will return docs like the passed Reader. - */ - @Deprecated - public Query like(Reader r, String fieldName) throws IOException { - return like(fieldName, r); - } - - /** - * Return a query that will return docs like the passed Readers. - * This was added in order to treat multi-value fields. - * - * @return a query that will return docs like the passed Readers. - */ - public Query like(String fieldName, Reader... readers) throws IOException { - Map words = new HashMap<>(); - for (Reader r : readers) { - addTermFrequencies(r, words, fieldName); - } - return createQuery(createQueue(words)); - } - - /** - * Create the More like query from a PriorityQueue - */ - private Query createQuery(PriorityQueue q) { - BooleanQuery query = new BooleanQuery(); - Object cur; - int qterms = 0; - float bestScore = 0; - - while ((cur = q.pop()) != null) { - Object[] ar = (Object[]) cur; - TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0])); - - if (boost) { - if (qterms == 0) { - bestScore = ((Float) ar[2]); - } - float myScore = ((Float) ar[2]); - - tq.setBoost(boostFactor * myScore / bestScore); - } - - try { - query.add(tq, BooleanClause.Occur.SHOULD); - } - catch (BooleanQuery.TooManyClauses ignore) { - break; - } - - qterms++; - if (maxQueryTerms > 0 && qterms >= maxQueryTerms) { - break; - } - } - - return query; - } - - /** - * Create a PriorityQueue from a word->tf map. 
- * - * @param words a map of words keyed on the word(String) with Int objects as the values. - */ - private PriorityQueue createQueue(Map words) throws IOException { - // have collected all words in doc and their freqs - int numDocs = ir.numDocs(); - FreqQ res = new FreqQ(words.size()); // will order words by score - - for (String word : words.keySet()) { // for every word - int tf = words.get(word).x; // term freq in the source doc - if (minTermFreq > 0 && tf < minTermFreq) { - continue; // filter out words that don't occur enough times in the source - } - - // go through all the fields and find the largest document frequency - String topField = fieldNames[0]; - int docFreq = 0; - for (String fieldName : fieldNames) { - int freq = ir.docFreq(new Term(fieldName, word)); - topField = (freq > docFreq) ? fieldName : topField; - docFreq = (freq > docFreq) ? freq : docFreq; - } - - if (minDocFreq > 0 && docFreq < minDocFreq) { - continue; // filter out words that don't occur in enough docs - } - - if (docFreq > maxDocFreq) { - continue; // filter out words that occur in too many docs - } - - if (docFreq == 0) { - continue; // index update problem? - } - - float idf = similarity.idf(docFreq, numDocs); - float score = tf * idf; - - // only really need 1st 3 entries, other ones are for troubleshooting - res.insertWithOverflow(new Object[]{word, // the word - topField, // the top field - score, // overall score - idf, // idf - docFreq, // freq in all docs - tf - }); - } - return res; - } - - /** - * Describe the parameters that control how the "more like this" query is formed. - */ - public String describeParams() { - StringBuilder sb = new StringBuilder(); - sb.append("\t").append("maxQueryTerms : ").append(maxQueryTerms).append("\n"); - sb.append("\t").append("minWordLen : ").append(minWordLen).append("\n"); - sb.append("\t").append("maxWordLen : ").append(maxWordLen).append("\n"); - sb.append("\t").append("fieldNames : "); - String delim = ""; - for (String fieldName : fieldNames) { - sb.append(delim).append(fieldName); - delim = ", "; - } - sb.append("\n"); - sb.append("\t").append("boost : ").append(boost).append("\n"); - sb.append("\t").append("minTermFreq : ").append(minTermFreq).append("\n"); - sb.append("\t").append("minDocFreq : ").append(minDocFreq).append("\n"); - return sb.toString(); - } - - /** - * Find words for a more-like-this query former. 
- * - * @param docNum the id of the lucene document from which to find terms - */ - public PriorityQueue retrieveTerms(int docNum) throws IOException { - Map termFreqMap = new HashMap<>(); - for (String fieldName : fieldNames) { - final Fields vectors = ir.getTermVectors(docNum); - final Terms vector; - if (vectors != null) { - vector = vectors.terms(fieldName); - } else { - vector = null; - } - - // field does not store term vector info - if (vector == null) { - Document d = ir.document(docNum); - IndexableField fields[] = d.getFields(fieldName); - for (IndexableField field : fields) { - final String stringValue = field.stringValue(); - if (stringValue != null) { - addTermFrequencies(new FastStringReader(stringValue), termFreqMap, fieldName); - } - } - } else { - addTermFrequencies(termFreqMap, vector); - } - } - - return createQueue(termFreqMap); - } - - /** - * Adds terms and frequencies found in vector into the Map termFreqMap - * - * @param termFreqMap a Map of terms and their frequencies - * @param vector List of terms and their frequencies for a doc/field - */ - private void addTermFrequencies(Map termFreqMap, Terms vector) throws IOException { - final TermsEnum termsEnum = vector.iterator(null); - final CharsRef spare = new CharsRef(); - BytesRef text; - while((text = termsEnum.next()) != null) { - UnicodeUtil.UTF8toUTF16(text, spare); - final String term = spare.toString(); - if (isNoiseWord(term)) { - continue; - } - final int freq = (int) termsEnum.totalTermFreq(); - - // increment frequency - Int cnt = termFreqMap.get(term); - if (cnt == null) { - cnt = new Int(); - termFreqMap.put(term, cnt); - cnt.x = freq; - } else { - cnt.x += freq; - } - } - } - - /** - * Adds term frequencies found by tokenizing text from reader into the Map words - * - * @param r a source of text to be tokenized - * @param termFreqMap a Map of terms and their frequencies - * @param fieldName Used by analyzer for any special per-field analysis - */ - private void addTermFrequencies(Reader r, Map termFreqMap, String fieldName) - throws IOException { - if (analyzer == null) { - throw new UnsupportedOperationException("To use MoreLikeThis without " + - "term vectors, you must provide an Analyzer"); - } - TokenStream ts = analyzer.tokenStream(fieldName, r); - try { - int tokenCount = 0; - // for every token - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - ts.reset(); - while (ts.incrementToken()) { - String word = termAtt.toString(); - tokenCount++; - if (tokenCount > maxNumTokensParsed) { - break; - } - if (isNoiseWord(word)) { - continue; - } - - // increment frequency - Int cnt = termFreqMap.get(word); - if (cnt == null) { - termFreqMap.put(word, new Int()); - } else { - cnt.x++; - } - } - ts.end(); - } finally { - IOUtils.closeWhileHandlingException(ts); - } - } - - - /** - * determines if the passed term is likely to be of interest in "more like" comparisons - * - * @param term The word being considered - * @return true if should be ignored, false if should be used in further analysis - */ - private boolean isNoiseWord(String term) { - int len = term.length(); - if (minWordLen > 0 && len < minWordLen) { - return true; - } - if (maxWordLen > 0 && len > maxWordLen) { - return true; - } - return stopWords != null && stopWords.contains(term); - } - - - /** - * Find words for a more-like-this query former. - * The result is a priority queue of arrays with one entry for every word in the document. - * Each array has 6 elements. - * The elements are: - *
- *   1. The word (String)
- *   2. The top field that this word comes from (String)
- *   3. The score for this word (Float)
- *   4. The IDF value (Float)
- *   5. The frequency of this word in the index (Integer)
- *   6. The frequency of this word in the source document (Integer)
- *
- * This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest. - * This method is exposed so that you can identify the "interesting words" in a document. - * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}. - * - * @param r the reader that has the content of the document - * @param fieldName field passed to the analyzer to use when analyzing the content - * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first - * @see #retrieveInterestingTerms - */ - public PriorityQueue retrieveTerms(Reader r, String fieldName) throws IOException { - Map words = new HashMap<>(); - addTermFrequencies(r, words, fieldName); - return createQueue(words); - } - - /** - * @see #retrieveInterestingTerms(java.io.Reader, String) - */ - public String[] retrieveInterestingTerms(int docNum) throws IOException { - ArrayList al = new ArrayList<>(maxQueryTerms); - PriorityQueue pq = retrieveTerms(docNum); - Object cur; - int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... - // we just want to return the top words - while (((cur = pq.pop()) != null) && lim-- > 0) { - Object[] ar = (Object[]) cur; - al.add(ar[0]); // the 1st entry is the interesting word - } - String[] res = new String[al.size()]; - return al.toArray(res); - } - - /** - * Convenience routine to make it easy to return the most interesting words in a document. - * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly. - * - * @param r the source document - * @param fieldName field passed to analyzer to use when analyzing the content - * @return the most interesting words in the document - * @see #retrieveTerms(java.io.Reader, String) - * @see #setMaxQueryTerms - */ - public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException { - ArrayList al = new ArrayList<>(maxQueryTerms); - PriorityQueue pq = retrieveTerms(r, fieldName); - Object cur; - int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... - // we just want to return the top words - while (((cur = pq.pop()) != null) && lim-- > 0) { - Object[] ar = (Object[]) cur; - al.add(ar[0]); // the 1st entry is the interesting word - } - String[] res = new String[al.size()]; - return al.toArray(res); - } - - /** - * PriorityQueue that orders words by score. - */ - private static class FreqQ extends PriorityQueue { - FreqQ(int s) { - super(s); - } - - @Override - protected boolean lessThan(Object[] aa, Object[] bb) { - Float fa = (Float) aa[2]; - Float fb = (Float) bb[2]; - return fa > fb; - } - } - - /** - * Use for frequencies and to avoid renewing Integers. 
- */ - private static class Int { - int x; - - Int() { - x = 1; - } - } -} diff --git a/src/main/java/org/elasticsearch/common/util/AbstractArray.java b/src/main/java/org/elasticsearch/common/util/AbstractArray.java index ba13ff392018a..7de3c1c3b7448 100644 --- a/src/main/java/org/elasticsearch/common/util/AbstractArray.java +++ b/src/main/java/org/elasticsearch/common/util/AbstractArray.java @@ -33,7 +33,7 @@ abstract class AbstractArray implements BigArray { @Override public final void close() { - bigArrays.ramBytesUsed.addAndGet(-sizeInBytes()); + bigArrays.ramBytesUsed.addAndGet(-ramBytesUsed()); assert !released : "double release"; released = true; doClose(); diff --git a/src/main/java/org/elasticsearch/common/util/AbstractBigArray.java b/src/main/java/org/elasticsearch/common/util/AbstractBigArray.java index 18e0feaf5fae4..22175387f770e 100644 --- a/src/main/java/org/elasticsearch/common/util/AbstractBigArray.java +++ b/src/main/java/org/elasticsearch/common/util/AbstractBigArray.java @@ -82,7 +82,7 @@ public final long size() { protected abstract int numBytesPerElement(); - public final long sizeInBytes() { + public final long ramBytesUsed() { // rough approximate, we only take into account the size of the values, not the overhead of the array objects return ((long) pageIndex(size - 1) + 1) * pageSize() * numBytesPerElement(); } diff --git a/src/main/java/org/elasticsearch/common/util/BigArray.java b/src/main/java/org/elasticsearch/common/util/BigArray.java index e6949249c7bc5..6fc2b10ed5936 100644 --- a/src/main/java/org/elasticsearch/common/util/BigArray.java +++ b/src/main/java/org/elasticsearch/common/util/BigArray.java @@ -19,17 +19,13 @@ package org.elasticsearch.common.util; +import org.apache.lucene.util.Accountable; import org.elasticsearch.common.lease.Releasable; /** Base abstraction of an array. */ -public interface BigArray extends Releasable { +public interface BigArray extends Releasable, Accountable { /** Return the length of this array. */ public long size(); - /** - * Return an estimated memory usage of this instance. 
diff --git a/src/main/java/org/elasticsearch/common/util/BigArray.java b/src/main/java/org/elasticsearch/common/util/BigArray.java
index e6949249c7bc5..6fc2b10ed5936 100644
--- a/src/main/java/org/elasticsearch/common/util/BigArray.java
+++ b/src/main/java/org/elasticsearch/common/util/BigArray.java
@@ -19,17 +19,13 @@
 
 package org.elasticsearch.common.util;
 
+import org.apache.lucene.util.Accountable;
 import org.elasticsearch.common.lease.Releasable;
 
 /** Base abstraction of an array. */
-public interface BigArray extends Releasable {
+public interface BigArray extends Releasable, Accountable {
 
     /** Return the length of this array. */
     public long size();
 
-    /**
-     * Return an estimated memory usage of this instance.
-     */
-    public long sizeInBytes();
-
 }
diff --git a/src/main/java/org/elasticsearch/common/util/BigArrays.java b/src/main/java/org/elasticsearch/common/util/BigArrays.java
index 155523bbbdb2c..5f362368e5ec3 100644
--- a/src/main/java/org/elasticsearch/common/util/BigArrays.java
+++ b/src/main/java/org/elasticsearch/common/util/BigArrays.java
@@ -118,7 +118,7 @@ private static class ByteArrayWrapper extends AbstractArrayWrapper implements By
         }
 
         @Override
-        public long sizeInBytes() {
+        public long ramBytesUsed() {
             return SHALLOW_SIZE + RamUsageEstimator.sizeOf(array);
         }
 
@@ -169,7 +169,7 @@ private static class IntArrayWrapper extends AbstractArrayWrapper implements Int
         }
 
         @Override
-        public long sizeInBytes() {
+        public long ramBytesUsed() {
             return SHALLOW_SIZE + RamUsageEstimator.sizeOf(array);
         }
 
@@ -212,7 +212,7 @@ private static class LongArrayWrapper extends AbstractArrayWrapper implements Lo
         }
 
         @Override
-        public long sizeInBytes() {
+        public long ramBytesUsed() {
             return SHALLOW_SIZE + RamUsageEstimator.sizeOf(array);
         }
 
@@ -254,7 +254,7 @@ private static class DoubleArrayWrapper extends AbstractArrayWrapper implements 
         }
 
         @Override
-        public long sizeInBytes() {
+        public long ramBytesUsed() {
             return SHALLOW_SIZE + RamUsageEstimator.sizeOf(array);
         }
 
@@ -297,7 +297,7 @@ private static class FloatArrayWrapper extends AbstractArrayWrapper implements F
         }
 
         @Override
-        public long sizeInBytes() {
+        public long ramBytesUsed() {
             return SHALLOW_SIZE + RamUsageEstimator.sizeOf(array);
         }
 
@@ -340,7 +340,7 @@ private static class ObjectArrayWrapper<T> extends AbstractArrayWrapper implemen
         }
 
         @Override
-        public long sizeInBytes() {
+        public long ramBytesUsed() {
             return SHALLOW_SIZE + RamUsageEstimator.alignObjectSize(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + RamUsageEstimator.NUM_BYTES_OBJECT_REF * size());
         }
 
@@ -386,16 +386,16 @@ private void validate(long delta) {
     }
 
     private <T extends BigArray> T resizeInPlace(T array, long newSize) {
-        final long oldMemSize = array.sizeInBytes();
+        final long oldMemSize = array.ramBytesUsed();
         array.resize(newSize);
-        validate(array.sizeInBytes() - oldMemSize);
+        validate(array.ramBytesUsed() - oldMemSize);
         return array;
     }
 
     private <T extends BigArray> T validate(T array) {
         boolean success = false;
         try {
-            validate(array.sizeInBytes());
+            validate(array.ramBytesUsed());
             success = true;
         } finally {
             if (!success) {
diff --git a/src/main/java/org/elasticsearch/env/NodeEnvironment.java b/src/main/java/org/elasticsearch/env/NodeEnvironment.java
index b9c2f0798428b..4a509bdf85cf0 100644
--- a/src/main/java/org/elasticsearch/env/NodeEnvironment.java
+++ b/src/main/java/org/elasticsearch/env/NodeEnvironment.java
@@ -22,7 +22,7 @@
 import com.google.common.collect.Sets;
 import com.google.common.primitives.Ints;
 import org.apache.lucene.store.Lock;
-import org.apache.lucene.store.XNativeFSLockFactory;
+import org.apache.lucene.store.NativeFSLockFactory;
 import org.apache.lucene.util.IOUtils;
 import org.elasticsearch.ElasticsearchIllegalStateException;
 import org.elasticsearch.cluster.node.DiscoveryNode;
@@ -78,7 +78,7 @@ public NodeEnvironment(Settings settings, Environment environment) {
             }
             logger.trace("obtaining node lock on {} ...", dir.getAbsolutePath());
             try {
-                XNativeFSLockFactory lockFactory = new XNativeFSLockFactory(dir);
+                NativeFSLockFactory lockFactory = new NativeFSLockFactory(dir);
                 Lock tmpLock = lockFactory.makeLock("node.lock");
                 boolean obtained = tmpLock.obtain();
                 if (obtained) {
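NodeEnvironment now locks node directories with the stock NativeFSLockFactory instead of the X-fork this patch removes. A standalone sketch of the obtain/release cycle it performs, with a made-up "data/nodes/0" path:

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.store.Lock;
    import org.apache.lucene.store.NativeFSLockFactory;

    public class NodeLockExample {
        public static void main(String[] args) throws IOException {
            NativeFSLockFactory lockFactory = new NativeFSLockFactory(new File("data/nodes/0"));
            Lock lock = lockFactory.makeLock("node.lock");
            if (lock.obtain()) { // false if another process already holds the lock
                try {
                    // the directory is exclusively ours while the lock is held
                } finally {
                    lock.close(); // releases the native lock
                }
            }
        }
    }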
diff --git a/src/main/java/org/elasticsearch/index/analysis/NumericTokenizer.java b/src/main/java/org/elasticsearch/index/analysis/NumericTokenizer.java
index 5b17b4eaaf426..acb9cb47f6096 100644
--- a/src/main/java/org/elasticsearch/index/analysis/NumericTokenizer.java
+++ b/src/main/java/org/elasticsearch/index/analysis/NumericTokenizer.java
@@ -22,6 +22,7 @@
 import org.apache.lucene.analysis.NumericTokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeFactory;
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.AttributeSource;
 import org.elasticsearch.common.io.Streams;
diff --git a/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java
index 0a83d663ad9da..148805b83937a 100644
--- a/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java
@@ -88,7 +88,7 @@ public WordDelimiterTokenFilterFactory(Index index, @IndexSettings Settings inde
     @Override
     public TokenStream create(TokenStream tokenStream) {
-        if (version.onOrAfter(Version.LUCENE_48)) {
+        if (version.onOrAfter(Version.LUCENE_4_8)) {
             return new WordDelimiterFilter(version,
                     tokenStream,
                     charTypeTable,
                     flags,
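The LUCENE_48 to LUCENE_4_8 change is a pure rename: Lucene 4.9 introduced underscore-separated Version constant names and deprecated the old spellings, so the gate behaves exactly as before. A trivial illustration of the check as it reads after the patch:

    import org.apache.lucene.util.Version;

    public class VersionGateExample {
        // Same version gate as in the factory above, using the 4.9 constant name.
        public static boolean useNewWordDelimiter(Version version) {
            return version.onOrAfter(Version.LUCENE_4_8);
        }
    }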
diff --git a/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java b/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java
index 1d64f7879ce32..88350ee875ecc 100644
--- a/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java
+++ b/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java
@@ -21,7 +21,7 @@
 
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene46.Lucene46Codec;
+import org.apache.lucene.codecs.lucene49.Lucene49Codec;
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.index.codec.docvaluesformat.DocValuesFormatProvider;
 import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
@@ -37,7 +37,7 @@
  * configured for a specific field the default postings format is used.
  */
 // LUCENE UPGRADE: make sure to move to a new codec depending on the lucene version
-public class PerFieldMappingPostingFormatCodec extends Lucene46Codec {
+public class PerFieldMappingPostingFormatCodec extends Lucene49Codec {
     private final ESLogger logger;
     private final MapperService mapperService;
     private final PostingsFormat defaultPostingFormat;
diff --git a/src/main/java/org/elasticsearch/index/codec/docvaluesformat/DiskDocValuesFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/docvaluesformat/DiskDocValuesFormatProvider.java
index d9e1e134ce711..bd2bfebf9dfb5 100644
--- a/src/main/java/org/elasticsearch/index/codec/docvaluesformat/DiskDocValuesFormatProvider.java
+++ b/src/main/java/org/elasticsearch/index/codec/docvaluesformat/DiskDocValuesFormatProvider.java
@@ -20,7 +20,7 @@
 package org.elasticsearch.index.codec.docvaluesformat;
 
 import org.apache.lucene.codecs.DocValuesFormat;
-import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat;
+import org.apache.lucene.codecs.lucene49.Lucene49DocValuesFormat;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
@@ -35,7 +35,8 @@ public class DiskDocValuesFormatProvider extends AbstractDocValuesFormatProvider
     @Inject
     public DiskDocValuesFormatProvider(@Assisted String name, @Assisted Settings docValuesFormatSettings) {
         super(name);
-        this.docValuesFormat = new DiskDocValuesFormat();
+        // TODO: log a warning if someone chooses this? just remove this altogether and map it to the 4.9 provider?
+        this.docValuesFormat = new Lucene49DocValuesFormat();
     }
 
     @Override
diff --git a/src/main/java/org/elasticsearch/index/codec/docvaluesformat/DocValuesFormats.java b/src/main/java/org/elasticsearch/index/codec/docvaluesformat/DocValuesFormats.java
index 0afcd99f39431..2d9b9a3facbb6 100644
--- a/src/main/java/org/elasticsearch/index/codec/docvaluesformat/DocValuesFormats.java
+++ b/src/main/java/org/elasticsearch/index/codec/docvaluesformat/DocValuesFormats.java
@@ -38,9 +38,10 @@ public class DocValuesFormats {
             builtInDocValuesFormatsX.put(name, new PreBuiltDocValuesFormatProvider.Factory(DocValuesFormat.forName(name)));
         }
         // LUCENE UPGRADE: update those DVF if necessary
-        builtInDocValuesFormatsX.put(DocValuesFormatService.DEFAULT_FORMAT, new PreBuiltDocValuesFormatProvider.Factory(DocValuesFormatService.DEFAULT_FORMAT, DocValuesFormat.forName("Lucene45")));
+        builtInDocValuesFormatsX.put(DocValuesFormatService.DEFAULT_FORMAT, new PreBuiltDocValuesFormatProvider.Factory(DocValuesFormatService.DEFAULT_FORMAT, DocValuesFormat.forName("Lucene49")));
         builtInDocValuesFormatsX.put("memory", new PreBuiltDocValuesFormatProvider.Factory("memory", DocValuesFormat.forName("Memory")));
-        builtInDocValuesFormatsX.put("disk", new PreBuiltDocValuesFormatProvider.Factory("disk", DocValuesFormat.forName("Disk")));
+        builtInDocValuesFormatsX.put("disk", new PreBuiltDocValuesFormatProvider.Factory("disk", DocValuesFormat.forName("Lucene49")));
+        builtInDocValuesFormatsX.put("Disk", new PreBuiltDocValuesFormatProvider.Factory("Disk", DocValuesFormat.forName("Lucene49")));
         builtInDocValuesFormats = builtInDocValuesFormatsX.immutableMap();
     }
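The codec and doc-values wiring above all converges on the Lucene 4.9 defaults: the ES codec now extends Lucene49Codec, and both the default and the legacy "disk"/"Disk" names map to the Lucene49 doc-values format. A minimal sketch of the per-field override hook this relies on; the "suggest" routing is invented for the example:

    import org.apache.lucene.codecs.DocValuesFormat;
    import org.apache.lucene.codecs.lucene49.Lucene49Codec;

    public class PerFieldExampleCodec extends Lucene49Codec {
        @Override
        public DocValuesFormat getDocValuesFormatForField(String field) {
            if ("suggest".equals(field)) { // hypothetical field-specific format
                return DocValuesFormat.forName("Memory");
            }
            return super.getDocValuesFormatForField(field); // Lucene49 default
        }
    }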
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/DefaultPostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/DefaultPostingsFormatProvider.java
index 3d9b7d5e65112..42b2306608a90 100644
--- a/src/main/java/org/elasticsearch/index/codec/postingsformat/DefaultPostingsFormatProvider.java
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/DefaultPostingsFormatProvider.java
@@ -19,8 +19,8 @@
 
 package org.elasticsearch.index.codec.postingsformat;
 
-import org.apache.lucene.codecs.BlockTreeTermsWriter;
 import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
 import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/PulsingPostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/PulsingPostingsFormatProvider.java
index 6c6525fe3475b..fefc73ab7479b 100644
--- a/src/main/java/org/elasticsearch/index/codec/postingsformat/PulsingPostingsFormatProvider.java
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/PulsingPostingsFormatProvider.java
@@ -19,8 +19,8 @@
 
 package org.elasticsearch.index.codec.postingsformat;
 
-import org.apache.lucene.codecs.BlockTreeTermsWriter;
 import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
 import org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
diff --git a/src/main/java/org/elasticsearch/index/fielddata/AtomicFieldData.java b/src/main/java/org/elasticsearch/index/fielddata/AtomicFieldData.java
index 21188c0f3e9f4..79aebf91af0d6 100644
--- a/src/main/java/org/elasticsearch/index/fielddata/AtomicFieldData.java
+++ b/src/main/java/org/elasticsearch/index/fielddata/AtomicFieldData.java
@@ -19,13 +19,14 @@
 
 package org.elasticsearch.index.fielddata;
 
+import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.index.fielddata.ScriptDocValues.Strings;
 
 /**
  * The thread safe {@link org.apache.lucene.index.AtomicReader} level cache of the data.
  */
-public interface AtomicFieldData