Skip to content

Commit

Permalink
Backport Apache Lucene HNSW
Browse files Browse the repository at this point in the history
fix javadoc

Harden TestVectorValues.testAddIndexesDirectory01

Resolve merge conflict

Fix random TestVectorValues failures by use of forceMerge (and getOnlyLeafReader)

LUCENE-9322: Make sure to account for vectors in SortingCodecReader. (apache#2028)

LUCENE-9583: extract separate RandomAccessVectorValues interface (apache#2037)

LUCENE-9322: fix a minor cosmetic refactoring error in IndexWriter's infoStream logging. The message always printed 'vector values' for every merge timing instead of naming the other parts of the Lucene index ('doc values', 'stored fields', etc.)

LUCENE-9322: Some fixes to SimpleTextVectorFormat. (apache#2071)

* Make sure the file extensions are unique.

* Fix bug in vector reading.

LUCENE-9004: KNN vector search using NSW graphs (apache#2022)

LUCENE-9610: fix TestKnnGraph.testMerge

LUCENE-9610: fix bug in previous test fix

Make Backward compatible
  • Loading branch information
Michael Sokolov authored and Anand Kotriwal committed Dec 1, 2020
1 parent 76c62e1 commit 57a9022
Show file tree
Hide file tree
Showing 91 changed files with 7,385 additions and 265 deletions.
27 changes: 17 additions & 10 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,22 @@ build
dist
lib
test-lib
/*~
/velocity.log
*~
.#*
/build.properties
/.idea
lucene/**/*.iml
parent.iml
*.ipr
*.iws
/.project
/.classpath
/.settings
/*.iml
.project
.classpath
.settings
.dir-locals.el
/.caches
/prj.el
bin
/bin
/bin.*
pom.xml
Expand All @@ -29,9 +32,13 @@ pom.xml
__pycache__
/dev-tools/scripts/scripts.iml
.DS_Store
.gradle
buildSrc

build/
.gradle/
.idea/

# Ignore the generated local settings file.
gradle.properties
gradle/
gradlew
gradlew.bat

# IntelliJ creates this folder, ignore.
dev-tools/missing-doclet/out/
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
Expand Down Expand Up @@ -119,7 +120,12 @@ public final CompoundFormat compoundFormat() {
public final PointsFormat pointsFormat() {
// A fresh Lucene60PointsFormat is constructed on every call; nothing is cached in this method.
return new Lucene60PointsFormat();
}


/**
* Always returns the shared {@link VectorFormat#EMPTY} constant.
* NOTE(review): presumably this older codec has no vector data of its own; confirm
* against the contract of VectorFormat.EMPTY.
*/
@Override
public VectorFormat vectorFormat() {
return VectorFormat.EMPTY;
}

@Override
public final DocValuesFormat docValuesFormat() {
return docValuesFormat;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
Expand Down Expand Up @@ -128,4 +129,9 @@ public final DocValuesFormat docValuesFormat() {
public final NormsFormat normsFormat() {
// Hands back the normsFormat field unchanged (its initialization is outside this excerpt).
return normsFormat;
}

/**
* Always returns the shared {@link VectorFormat#EMPTY} constant.
* NOTE(review): presumably this older codec has no vector data of its own; confirm
* against the contract of VectorFormat.EMPTY.
*/
@Override
public final VectorFormat vectorFormat() {
return VectorFormat.EMPTY;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
Expand Down Expand Up @@ -136,6 +137,11 @@ public PointsFormat pointsFormat() {
return new Lucene60PointsFormat();
}

/**
* Always returns the shared {@link VectorFormat#EMPTY} constant.
* NOTE(review): presumably this older codec has no vector data of its own; confirm
* against the contract of VectorFormat.EMPTY.
*/
@Override
public VectorFormat vectorFormat() {
return VectorFormat.EMPTY;
}

/** Returns the postings format that should be used for writing
* new segments of <code>field</code>.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
Expand Down Expand Up @@ -136,6 +137,11 @@ public final PointsFormat pointsFormat() {
return pointsFormat;
}

/**
* Always returns the shared {@link VectorFormat#EMPTY} constant.
* NOTE(review): presumably this older codec has no vector data of its own; confirm
* against the contract of VectorFormat.EMPTY.
*/
@Override
public final VectorFormat vectorFormat() {
return VectorFormat.EMPTY;
}

/** Returns the postings format that should be used for writing
* new segments of <code>field</code>.
*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.codecs.lucene87;

import java.util.Objects;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.codecs.lucene80.Lucene80NormsFormat;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;

/**
 * Implements the Lucene 8.7 index format, with configurable per-field postings
 * and docvalues formats.
 * <p>
 * If you want to reuse functionality of this codec in another codec, extend
 * {@link FilterCodec}.
 * <p>
 * {@link #vectorFormat()} always returns {@link VectorFormat#EMPTY}.
 *
 * @see org.apache.lucene.codecs.lucene87 package documentation for file format details.
 *
 * @lucene.experimental
 */
public class Lucene87Codec extends Codec {
  // Fixed component formats; none of these can be customized by subclasses.
  private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
  private final FieldInfosFormat fieldInfosFormat = new Lucene60FieldInfosFormat();
  private final SegmentInfoFormat segmentInfosFormat = new Lucene86SegmentInfoFormat();
  private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
  private final CompoundFormat compoundFormat = new Lucene50CompoundFormat();
  private final PointsFormat pointsFormat = new Lucene86PointsFormat();

  // Per-field dispatcher: defers to the overridable getPostingsFormatForField hook
  // each time a field's postings format is looked up.
  private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      return Lucene87Codec.this.getPostingsFormatForField(field);
    }
  };

  // Per-field dispatcher: defers to the overridable getDocValuesFormatForField hook
  // each time a field's docvalues format is looked up.
  private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() {
    @Override
    public DocValuesFormat getDocValuesFormatForField(String field) {
      return Lucene87Codec.this.getDocValuesFormatForField(field);
    }
  };

  // Value returned by the default getDocValuesFormatForField implementation.
  private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene80");

  private final NormsFormat normsFormat = new Lucene80NormsFormat();

  // Value returned by the default getPostingsFormatForField implementation;
  // assigned in the constructor.
  private final PostingsFormat defaultFormat;

  // Depends on the compression mode passed to the constructor.
  private final StoredFieldsFormat storedFieldsFormat;

  /**
   * Instantiates a new codec using
   * {@link Lucene87StoredFieldsFormat.Mode#BEST_SPEED} for stored fields.
   */
  public Lucene87Codec() {
    this(Lucene87StoredFieldsFormat.Mode.BEST_SPEED);
  }

  /**
   * Instantiates a new codec, specifying the stored fields compression
   * mode to use.
   *
   * @param mode stored fields compression mode to use for newly
   *             flushed/merged segments.
   * @throws NullPointerException if {@code mode} is {@code null}
   */
  public Lucene87Codec(Lucene87StoredFieldsFormat.Mode mode) {
    super("Lucene87");
    this.storedFieldsFormat = new Lucene87StoredFieldsFormat(Objects.requireNonNull(mode, "mode"));
    this.defaultFormat = new Lucene84PostingsFormat();
  }

  @Override
  public final StoredFieldsFormat storedFieldsFormat() {
    return storedFieldsFormat;
  }

  @Override
  public final TermVectorsFormat termVectorsFormat() {
    return vectorsFormat;
  }

  @Override
  public final PostingsFormat postingsFormat() {
    return postingsFormat;
  }

  @Override
  public final FieldInfosFormat fieldInfosFormat() {
    return fieldInfosFormat;
  }

  @Override
  public final SegmentInfoFormat segmentInfoFormat() {
    return segmentInfosFormat;
  }

  @Override
  public final LiveDocsFormat liveDocsFormat() {
    return liveDocsFormat;
  }

  @Override
  public final CompoundFormat compoundFormat() {
    return compoundFormat;
  }

  @Override
  public final PointsFormat pointsFormat() {
    return pointsFormat;
  }

  /** Always returns {@link VectorFormat#EMPTY}. */
  @Override
  public final VectorFormat vectorFormat() {
    return VectorFormat.EMPTY;
  }

  @Override
  public final DocValuesFormat docValuesFormat() {
    return docValuesFormat;
  }

  @Override
  public final NormsFormat normsFormat() {
    return normsFormat;
  }

  /** Returns the postings format that should be used for writing
   *  new segments of <code>field</code>.
   *
   *  The default implementation always returns "Lucene84".
   *  <p>
   *  <b>WARNING:</b> if you subclass, you are responsible for index
   *  backwards compatibility: future versions of Lucene are only
   *  guaranteed to be able to read the default implementation.
   */
  public PostingsFormat getPostingsFormatForField(String field) {
    return defaultFormat;
  }

  /** Returns the docvalues format that should be used for writing
   *  new segments of <code>field</code>.
   *
   *  The default implementation always returns "Lucene80".
   *  <p>
   *  <b>WARNING:</b> if you subclass, you are responsible for index
   *  backwards compatibility: future versions of Lucene are only
   *  guaranteed to be able to read the default implementation.
   */
  public DocValuesFormat getDocValuesFormatForField(String field) {
    return defaultDVFormat;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Lucene 8.7 file format.
</body>
</html>
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ org.apache.lucene.codecs.lucene70.Lucene70Codec
org.apache.lucene.codecs.lucene80.Lucene80Codec
org.apache.lucene.codecs.lucene84.Lucene84Codec
org.apache.lucene.codecs.lucene86.Lucene86Codec
org.apache.lucene.codecs.lucene87.Lucene87Codec
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene87.Lucene87Codec;
import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexDeletionPolicy;
Expand Down Expand Up @@ -139,7 +139,7 @@ public static IndexWriterConfig createWriterConfig(Config config, PerfRunData ru
if (defaultCodec == null && postingsFormat != null) {
try {
final PostingsFormat postingsFormatChosen = PostingsFormat.forName(postingsFormat);
iwConf.setCodec(new Lucene87Codec() {
iwConf.setCodec(new Lucene90Codec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return postingsFormatChosen;
Expand Down

0 comments on commit 57a9022

Please sign in to comment.