From b5d532f9e3d184d4bb895835a4d4fef2fb4ee0e8 Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Wed, 12 Dec 2018 21:20:53 -0500 Subject: [PATCH] Vector field (#33022) 1. Dense vector PUT dindex { "mappings": { "_doc": { "properties": { "my_vector": { "type": "dense_vector" }, "my_text" : { "type" : "keyword" } } } } } PUT dinex/_doc/1 { "my_text" : "text1", "my_vector" : [ 0.5, 10, 6 ] } 2. Sparse vector PUT sindex { "mappings": { "_doc": { "properties": { "my_vector": { "type": "sparse_vector" }, "my_text" : { "type" : "keyword" } } } } } PUT sindex/_doc/1 { "my_text" : "text1", "my_vector" : {"1": 0.5, "99": -0.5, "5": 1} } --- docs/reference/mapping/types.asciidoc | 9 + .../mapping/types/dense-vector.asciidoc | 52 +++++ .../mapping/types/sparse-vector.asciidoc | 55 +++++ .../index/mapper/DenseVectorFieldMapper.java | 195 +++++++++++++++++ .../index/mapper/MapperExtrasPlugin.java | 2 + .../index/mapper/SparseVectorFieldMapper.java | 207 ++++++++++++++++++ .../index/mapper/VectorEncoderDecoder.java | 141 ++++++++++++ .../mapper/DenseVectorFieldMapperTests.java | 81 +++++++ .../mapper/DenseVectorFieldTypeTests.java | 28 +++ .../mapper/SparseVectorFieldMapperTests.java | 164 ++++++++++++++ .../mapper/SparseVectorFieldTypeTests.java | 28 +++ .../mapper/VectorEncoderDecoderTests.java | 115 ++++++++++ .../test/dense-vector/10_indexing.yml | 29 +++ .../test/sparse-vector/10_indexing.yml | 29 +++ 14 files changed, 1135 insertions(+) create mode 100644 docs/reference/mapping/types/dense-vector.asciidoc create mode 100644 docs/reference/mapping/types/sparse-vector.asciidoc create mode 100644 modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/DenseVectorFieldMapper.java create mode 100644 modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/SparseVectorFieldMapper.java create mode 100644 modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/VectorEncoderDecoder.java create mode 100644 
modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/DenseVectorFieldMapperTests.java create mode 100644 modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/DenseVectorFieldTypeTests.java create mode 100644 modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/SparseVectorFieldMapperTests.java create mode 100644 modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/SparseVectorFieldTypeTests.java create mode 100644 modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/VectorEncoderDecoderTests.java create mode 100644 modules/mapper-extras/src/test/resources/rest-api-spec/test/dense-vector/10_indexing.yml create mode 100644 modules/mapper-extras/src/test/resources/rest-api-spec/test/sparse-vector/10_indexing.yml diff --git a/docs/reference/mapping/types.asciidoc b/docs/reference/mapping/types.asciidoc index 9cd55bee8553b..5daa01ea7de1a 100644 --- a/docs/reference/mapping/types.asciidoc +++ b/docs/reference/mapping/types.asciidoc @@ -47,6 +47,11 @@ string:: <> and <> <>:: Record numeric feature vectors to boost hits at query time. +<>:: Record dense vectors of float values. + +<>:: Record sparse vectors of float values. + + [float] === Multi-fields @@ -98,3 +103,7 @@ include::types/parent-join.asciidoc[] include::types/feature.asciidoc[] include::types/feature-vector.asciidoc[] + +include::types/dense-vector.asciidoc[] + +include::types/sparse-vector.asciidoc[] \ No newline at end of file diff --git a/docs/reference/mapping/types/dense-vector.asciidoc b/docs/reference/mapping/types/dense-vector.asciidoc new file mode 100644 index 0000000000000..b09c3af4f1c1d --- /dev/null +++ b/docs/reference/mapping/types/dense-vector.asciidoc @@ -0,0 +1,52 @@ +[[dense-vector]] +=== Dense vector datatype + +A `dense_vector` field stores dense vectors of float values. +The maximum number of dimensions that can be in a vector should +not exceed 500. The number of dimensions can be +different across documents. 
A `dense_vector` field is +a single-valued field. + +These vectors can be used for document scoring. +For example, a document score can represent a distance between +a given query vector and the indexed document vector. + +You index a dense vector as an array of floats. + +[source,js] +-------------------------------------------------- +PUT my_index +{ + "mappings": { + "_doc": { + "properties": { + "my_vector": { + "type": "dense_vector" + }, + "my_text" : { + "type" : "keyword" + } + } + } + } +} + +PUT my_index/_doc/1 +{ + "my_text" : "text1", + "my_vector" : [0.5, 10, 6] +} + +PUT my_index/_doc/2 +{ + "my_text" : "text2", + "my_vector" : [-0.5, 10, 10, 4] +} + +-------------------------------------------------- +// CONSOLE + +Internally, each document's dense vector is encoded as a binary +doc value. Its size in bytes is equal to +`4 * NUMBER_OF_DIMENSIONS`, where `NUMBER_OF_DIMENSIONS` - +number of the vector's dimensions. \ No newline at end of file diff --git a/docs/reference/mapping/types/sparse-vector.asciidoc b/docs/reference/mapping/types/sparse-vector.asciidoc new file mode 100644 index 0000000000000..72df36b4887f9 --- /dev/null +++ b/docs/reference/mapping/types/sparse-vector.asciidoc @@ -0,0 +1,55 @@ +[[sparse-vector]] +=== Sparse vector datatype + +A `sparse_vector` field stores sparse vectors of float values. +The maximum number of dimensions that can be in a vector should +not exceed 500. The number of dimensions can be +different across documents. A `sparse_vector` field is +a single-valued field. + +These vectors can be used for document scoring. +For example, a document score can represent a distance between +a given query vector and the indexed document vector. + +You represent a sparse vector as an object, where object fields +are dimensions, and fields values are values for these dimensions. +Dimensions are integer values from `0` to `65535` encoded as strings. +Dimensions don't need to be in order. 
+ +[source,js] +-------------------------------------------------- +PUT my_index +{ + "mappings": { + "_doc": { + "properties": { + "my_vector": { + "type": "sparse_vector" + }, + "my_text" : { + "type" : "keyword" + } + } + } + } +} + +PUT my_index/_doc/1 +{ + "my_text" : "text1", + "my_vector" : {"1": 0.5, "5": -0.5, "100": 1} +} + +PUT my_index/_doc/2 +{ + "my_text" : "text2", + "my_vector" : {"103": 0.5, "4": -0.5, "5": 1, "11" : 1.2} +} + +-------------------------------------------------- +// CONSOLE + +Internally, each document's sparse vector is encoded as a binary +doc value. Its size in bytes is equal to +`6 * NUMBER_OF_DIMENSIONS`, where `NUMBER_OF_DIMENSIONS` - +number of the vector's dimensions. \ No newline at end of file diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/DenseVectorFieldMapper.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/DenseVectorFieldMapper.java new file mode 100644 index 0000000000000..fdcb1f54ea7dd --- /dev/null +++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/DenseVectorFieldMapper.java @@ -0,0 +1,195 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.mapper; + +import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.search.DocValuesFieldExistsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.XContentParser.Token; +import org.elasticsearch.index.fielddata.IndexFieldData; +import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.search.DocValueFormat; +import org.joda.time.DateTimeZone; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import static org.elasticsearch.common.xcontent.XContentParserUtils.ensureExpectedToken; + +/** + * A {@link FieldMapper} for indexing a dense vector of floats. + */ +public class DenseVectorFieldMapper extends FieldMapper implements ArrayValueMapperParser { + + public static final String CONTENT_TYPE = "dense_vector"; + public static short MAX_DIMS_COUNT = 500; //maximum allowed number of dimensions + private static final byte INT_BYTES = 4; + + public static class Defaults { + public static final MappedFieldType FIELD_TYPE = new DenseVectorFieldType(); + + static { + FIELD_TYPE.setTokenized(false); + FIELD_TYPE.setIndexOptions(IndexOptions.NONE); + FIELD_TYPE.setHasDocValues(true); + FIELD_TYPE.setOmitNorms(true); + FIELD_TYPE.freeze(); + } + } + + public static class Builder extends FieldMapper.Builder { + + public Builder(String name) { + super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE); + builder = this; + } + + @Override + public DenseVectorFieldType fieldType() { + return (DenseVectorFieldType) super.fieldType(); + } + + @Override + public DenseVectorFieldMapper build(BuilderContext context) { + setupFieldType(context); + return new DenseVectorFieldMapper( + name, fieldType, 
defaultFieldType, + context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo); + } + } + + public static class TypeParser implements Mapper.TypeParser { + @Override + public Mapper.Builder parse(String name, Map node, ParserContext parserContext) throws MapperParsingException { + DenseVectorFieldMapper.Builder builder = new DenseVectorFieldMapper.Builder(name); + return builder; + } + } + + public static final class DenseVectorFieldType extends MappedFieldType { + + public DenseVectorFieldType() {} + + protected DenseVectorFieldType(DenseVectorFieldType ref) { + super(ref); + } + + public DenseVectorFieldType clone() { + return new DenseVectorFieldType(this); + } + + @Override + public String typeName() { + return CONTENT_TYPE; + } + + @Override + public DocValueFormat docValueFormat(String format, DateTimeZone timeZone) { + throw new UnsupportedOperationException( + "Field [" + name() + "] of type [" + typeName() + "] doesn't support docvalue_fields or aggregations"); + } + + @Override + public Query existsQuery(QueryShardContext context) { + return new DocValuesFieldExistsQuery(name()); + } + + @Override + public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) { + throw new UnsupportedOperationException( + "Field [" + name() + "] of type [" + typeName() + "] doesn't support sorting, scripting or aggregating"); + } + + @Override + public Query termQuery(Object value, QueryShardContext context) { + throw new UnsupportedOperationException( + "Field [" + name() + "] of type [" + typeName() + "] doesn't support queries"); + } + } + + private DenseVectorFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType, + Settings indexSettings, MultiFields multiFields, CopyTo copyTo) { + super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo); + assert fieldType.indexOptions() == IndexOptions.NONE; + } + + @Override + protected DenseVectorFieldMapper clone() { + return 
(DenseVectorFieldMapper) super.clone(); + } + + @Override + public DenseVectorFieldType fieldType() { + return (DenseVectorFieldType) super.fieldType(); + } + + @Override + public void parse(ParseContext context) throws IOException { + if (context.externalValueSet()) { + throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] can't be used in multi-fields"); + } + + // encode array of floats as array of integers and store into buf + // this code is here and not int the VectorEncoderDecoder so not to create extra arrays + byte[] buf = new byte[0]; + int offset = 0; + int dim = 0; + for (Token token = context.parser().nextToken(); token != Token.END_ARRAY; token = context.parser().nextToken()) { + ensureExpectedToken(Token.VALUE_NUMBER, token, context.parser()::getTokenLocation); + float value = context.parser().floatValue(true); + if (buf.length < (offset + INT_BYTES)) { + buf = ArrayUtil.grow(buf, (offset + INT_BYTES)); + } + int intValue = Float.floatToIntBits(value); + buf[offset] = (byte) (intValue >> 24); + buf[offset+1] = (byte) (intValue >> 16); + buf[offset+2] = (byte) (intValue >> 8); + buf[offset+3] = (byte) intValue; + offset += INT_BYTES; + dim++; + if (dim >= MAX_DIMS_COUNT) { + throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + + "] has exceeded the maximum allowed number of dimensions of :[" + MAX_DIMS_COUNT + "]"); + } + } + BinaryDocValuesField field = new BinaryDocValuesField(fieldType().name(), new BytesRef(buf, 0, offset)); + if (context.doc().getByKey(fieldType().name()) != null) { + throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + + "] doesn't not support indexing multiple values for the same field in the same document"); + } + context.doc().addWithKey(fieldType().name(), field); + } + + @Override + protected void parseCreateField(ParseContext context, List fields) { + throw new AssertionError("parse is implemented directly"); + } + + @Override 
+ protected String contentType() { + return CONTENT_TYPE; + } +} diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/MapperExtrasPlugin.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/MapperExtrasPlugin.java index 513f052212f22..a875ebbb079ef 100644 --- a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/MapperExtrasPlugin.java +++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/MapperExtrasPlugin.java @@ -39,6 +39,8 @@ public Map getMappers() { mappers.put(TokenCountFieldMapper.CONTENT_TYPE, new TokenCountFieldMapper.TypeParser()); mappers.put(FeatureFieldMapper.CONTENT_TYPE, new FeatureFieldMapper.TypeParser()); mappers.put(FeatureVectorFieldMapper.CONTENT_TYPE, new FeatureVectorFieldMapper.TypeParser()); + mappers.put(DenseVectorFieldMapper.CONTENT_TYPE, new DenseVectorFieldMapper.TypeParser()); + mappers.put(SparseVectorFieldMapper.CONTENT_TYPE, new SparseVectorFieldMapper.TypeParser()); return Collections.unmodifiableMap(mappers); } diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/SparseVectorFieldMapper.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/SparseVectorFieldMapper.java new file mode 100644 index 0000000000000..2eb360255d070 --- /dev/null +++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/SparseVectorFieldMapper.java @@ -0,0 +1,207 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.mapper; + +import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.search.DocValuesFieldExistsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.XContentParser.Token; +import org.elasticsearch.index.fielddata.IndexFieldData; +import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.search.DocValueFormat; +import org.joda.time.DateTimeZone; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import static org.elasticsearch.common.xcontent.XContentParserUtils.ensureExpectedToken; + +/** + * A {@link FieldMapper} for indexing a sparse vector of floats. 
+ */ +public class SparseVectorFieldMapper extends FieldMapper { + + public static final String CONTENT_TYPE = "sparse_vector"; + public static short MAX_DIMS_COUNT = 500; //maximum allowed number of dimensions + public static int MAX_DIMS_NUMBER = 65535; //maximum allowed dimension's number + + public static class Defaults { + public static final MappedFieldType FIELD_TYPE = new SparseVectorFieldType(); + + static { + FIELD_TYPE.setTokenized(false); + FIELD_TYPE.setIndexOptions(IndexOptions.NONE); + FIELD_TYPE.setHasDocValues(true); + FIELD_TYPE.setOmitNorms(true); + FIELD_TYPE.freeze(); + } + } + + public static class Builder extends FieldMapper.Builder { + + public Builder(String name) { + super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE); + builder = this; + } + + @Override + public SparseVectorFieldType fieldType() { + return (SparseVectorFieldType) super.fieldType(); + } + + @Override + public SparseVectorFieldMapper build(BuilderContext context) { + setupFieldType(context); + return new SparseVectorFieldMapper( + name, fieldType, defaultFieldType, + context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo); + } + } + + public static class TypeParser implements Mapper.TypeParser { + @Override + public Mapper.Builder parse(String name, Map node, ParserContext parserContext) throws MapperParsingException { + SparseVectorFieldMapper.Builder builder = new SparseVectorFieldMapper.Builder(name); + return builder; + } + } + + public static final class SparseVectorFieldType extends MappedFieldType { + + public SparseVectorFieldType() {} + + protected SparseVectorFieldType(SparseVectorFieldType ref) { + super(ref); + } + + public SparseVectorFieldType clone() { + return new SparseVectorFieldType(this); + } + + @Override + public String typeName() { + return CONTENT_TYPE; + } + + @Override + public DocValueFormat docValueFormat(String format, DateTimeZone timeZone) { + throw new UnsupportedOperationException( + "Field [" + name() + "] of type [" + 
typeName() + "] doesn't support docvalue_fields or aggregations"); + } + + @Override + public Query existsQuery(QueryShardContext context) { + return new DocValuesFieldExistsQuery(name()); + } + + @Override + public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) { + throw new UnsupportedOperationException( + "Field [" + name() + "] of type [" + typeName() + "] doesn't support sorting, scripting or aggregating"); + } + + @Override + public Query termQuery(Object value, QueryShardContext context) { + throw new UnsupportedOperationException( + "Field [" + name() + "] of type [" + typeName() + "] doesn't support queries"); + } + } + + + private SparseVectorFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType, + Settings indexSettings, MultiFields multiFields, CopyTo copyTo) { + super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo); + assert fieldType.indexOptions() == IndexOptions.NONE; + } + + @Override + protected SparseVectorFieldMapper clone() { + return (SparseVectorFieldMapper) super.clone(); + } + + @Override + public SparseVectorFieldType fieldType() { + return (SparseVectorFieldType) super.fieldType(); + } + + @Override + public void parse(ParseContext context) throws IOException { + if (context.externalValueSet()) { + throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] can't be used in multi-fields"); + } + ensureExpectedToken(Token.START_OBJECT, context.parser().currentToken(), context.parser()::getTokenLocation); + int[] dims = new int[0]; + float[] values = new float[0]; + int dimCount = 0; + int dim = 0; + float value; + for (Token token = context.parser().nextToken(); token != Token.END_OBJECT; token = context.parser().nextToken()) { + if (token == Token.FIELD_NAME) { + try { + dim = Integer.parseInt(context.parser().currentName()); + if (dim < 0 || dim > MAX_DIMS_NUMBER) { + throw new IllegalArgumentException("Field [" + name() 
+ "] of type [" + typeName() + "]'s dimension number " + + "must be a non-negative integer value not exceeding [" + MAX_DIMS_NUMBER + "], got [" + dim + "]"); + } + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "]'s dimensions should be " + + "integers represented as strings, but got [" + context.parser().currentName() + "]", e); + } + } else if (token == Token.VALUE_NUMBER) { + value = context.parser().floatValue(true); + if (dims.length <= dimCount) { // ensure arrays have enough capacity + values = ArrayUtil.grow(values, dimCount + 1); + dims = ArrayUtil.grow(dims, dimCount + 1); + } + dims[dimCount] = dim; + values[dimCount] = value; + dimCount ++; + if (dimCount >= MAX_DIMS_COUNT) { + throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + + "] has exceeded the maximum allowed number of dimensions of :[" + MAX_DIMS_COUNT + "]"); + } + } else { + throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + + "] takes an object that maps a dimension number to a float, " + "but got unexpected token [" + token + "]"); + } + } + + BytesRef br = VectorEncoderDecoder.encodeSparseVector(dims, values, dimCount); + BinaryDocValuesField field = new BinaryDocValuesField(fieldType().name(), br); + context.doc().addWithKey(fieldType().name(), field); + } + + + @Override + protected void parseCreateField(ParseContext context, List fields) { + throw new AssertionError("parse is implemented directly"); + } + + @Override + protected String contentType() { + return CONTENT_TYPE; + } +} diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/VectorEncoderDecoder.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/VectorEncoderDecoder.java new file mode 100644 index 0000000000000..c21b006c8836b --- /dev/null +++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/VectorEncoderDecoder.java @@ 
-0,0 +1,141 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.mapper; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.InPlaceMergeSorter; + +// static utility functions for encoding and decoding dense_vector and sparse_vector fields +final class VectorEncoderDecoder { + static final byte INT_BYTES = 4; + static final byte SHORT_BYTES = 2; + + private VectorEncoderDecoder() { } + + /** + * Encodes a sparse array represented by values, dims and dimCount into a bytes array - BytesRef + * BytesRef: int[] floats encoded as integers values, 2 bytes for each dimension + * @param values - values of the sparse array + * @param dims - dims of the sparse array + * @param dimCount - number of the dimension + * @return BytesRef + */ + static BytesRef encodeSparseVector(int[] dims, float[] values, int dimCount) { + // 1. Sort dims and values + sortSparseDimsValues(dims, values, dimCount); + byte[] buf = new byte[dimCount * (INT_BYTES + SHORT_BYTES)]; + + // 2. 
Encode dimensions + // as each dimension is a positive value that doesn't exceed 65535, 2 bytes is enough for encoding it + int offset = 0; + for (int dim = 0; dim < dimCount; dim++) { + buf[offset] = (byte) (dims[dim] >> 8); + buf[offset+1] = (byte) dims[dim]; + offset += SHORT_BYTES; + } + + // 3. Encode values + for (int dim = 0; dim < dimCount; dim++) { + int intValue = Float.floatToIntBits(values[dim]); + buf[offset] = (byte) (intValue >> 24); + buf[offset+1] = (byte) (intValue >> 16); + buf[offset+2] = (byte) (intValue >> 8); + buf[offset+3] = (byte) intValue; + offset += INT_BYTES; + } + + return new BytesRef(buf); + } + + /** + * Decodes the first part of BytesRef into sparse vector dimensions + * @param vectorBR - vector decoded in BytesRef + */ + static int[] decodeSparseVectorDims(BytesRef vectorBR) { + int dimCount = vectorBR.length / (INT_BYTES + SHORT_BYTES); + int[] dims = new int[dimCount]; + int offset = vectorBR.offset; + for (int dim = 0; dim < dimCount; dim++) { + dims[dim] = ((vectorBR.bytes[offset] & 0xFF) << 8) | (vectorBR.bytes[offset+1] & 0xFF); + offset += SHORT_BYTES; + } + return dims; + } + + /** + * Decodes the second part of the BytesRef into sparse vector values + * @param vectorBR - vector decoded in BytesRef + */ + static float[] decodeSparseVector(BytesRef vectorBR) { + int dimCount = vectorBR.length / (INT_BYTES + SHORT_BYTES); + int offset = vectorBR.offset + SHORT_BYTES * dimCount; //calculate the offset from where values are encoded + float[] vector = new float[dimCount]; + for (int dim = 0; dim < dimCount; dim++) { + int intValue = ((vectorBR.bytes[offset] & 0xFF) << 24) | + ((vectorBR.bytes[offset+1] & 0xFF) << 16) | + ((vectorBR.bytes[offset+2] & 0xFF) << 8) | + (vectorBR.bytes[offset+3] & 0xFF); + vector[dim] = Float.intBitsToFloat(intValue); + offset = offset + INT_BYTES; + } + return vector; + } + + + /** + Sort dimensions in the ascending order and + sort values in the same order as their corresponding dimensions + **/ 
+ static void sortSparseDimsValues(int[] dims, float[] values, int n) { + new InPlaceMergeSorter() { + @Override + public int compare(int i, int j) { + return Integer.compare(dims[i], dims[j]); + } + + @Override + public void swap(int i, int j) { + int tempDim = dims[i]; + dims[i] = dims[j]; + dims[j] = tempDim; + + float tempValue = values[j]; + values[j] = values[i]; + values[i] = tempValue; + } + }.sort(0, n); + } + + // Decodes a BytesRef into an array of floats + static float[] decodeDenseVector(BytesRef vectorBR) { + int dimCount = vectorBR.length / INT_BYTES; + float[] vector = new float[dimCount]; + int offset = vectorBR.offset; + for (int dim = 0; dim < dimCount; dim++) { + int intValue = ((vectorBR.bytes[offset] & 0xFF) << 24) | + ((vectorBR.bytes[offset+1] & 0xFF) << 16) | + ((vectorBR.bytes[offset+2] & 0xFF) << 8) | + (vectorBR.bytes[offset+3] & 0xFF); + vector[dim] = Float.intBitsToFloat(intValue); + offset = offset + INT_BYTES; + } + return vector; + } +} diff --git a/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/DenseVectorFieldMapperTests.java b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/DenseVectorFieldMapperTests.java new file mode 100644 index 0000000000000..db0bf5d48d444 --- /dev/null +++ b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/DenseVectorFieldMapperTests.java @@ -0,0 +1,81 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.mapper; + +import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.compress.CompressedXContent; +import org.elasticsearch.common.xcontent.XContentFactory; +import org.elasticsearch.common.xcontent.XContentType; +import org.elasticsearch.index.IndexService; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.test.ESSingleNodeTestCase; +import org.hamcrest.Matchers; + +import java.util.Collection; + +public class DenseVectorFieldMapperTests extends ESSingleNodeTestCase { + + @Override + protected Collection> getPlugins() { + return pluginList(MapperExtrasPlugin.class); + } + + public void testDefaults() throws Exception { + IndexService indexService = createIndex("test-index"); + DocumentMapperParser parser = indexService.mapperService().documentMapperParser(); + String mapping = Strings.toString(XContentFactory.jsonBuilder() + .startObject() + .startObject("_doc") + .startObject("properties") + .startObject("my-dense-vector").field("type", "dense_vector") + .endObject() + .endObject() + .endObject() + .endObject()); + + DocumentMapper mapper = parser.parse("_doc", new CompressedXContent(mapping)); + assertEquals(mapping, mapper.mappingSource().toString()); + + float[] expectedArray = {-12.1f, 100.7f, -4}; + ParsedDocument doc1 = mapper.parse(SourceToParse.source("test-index", 
"_doc", "1", BytesReference + .bytes(XContentFactory.jsonBuilder() + .startObject() + .startArray("my-dense-vector").value(expectedArray[0]).value(expectedArray[1]).value(expectedArray[2]).endArray() + .endObject()), + XContentType.JSON)); + IndexableField[] fields = doc1.rootDoc().getFields("my-dense-vector"); + assertEquals(1, fields.length); + assertThat(fields[0], Matchers.instanceOf(BinaryDocValuesField.class)); + + // assert that after decoding the indexed value is equal to expected + BytesRef vectorBR = ((BinaryDocValuesField) fields[0]).binaryValue(); + float[] decodedValues = VectorEncoderDecoder.decodeDenseVector(vectorBR); + assertArrayEquals( + "Decoded dense vector values is not equal to the indexed one.", + expectedArray, + decodedValues, + 0.001f + ); + } +} diff --git a/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/DenseVectorFieldTypeTests.java b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/DenseVectorFieldTypeTests.java new file mode 100644 index 0000000000000..7bbf862e0fc2b --- /dev/null +++ b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/DenseVectorFieldTypeTests.java @@ -0,0 +1,28 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.mapper; + +public class DenseVectorFieldTypeTests extends FieldTypeTestCase { + + @Override + protected MappedFieldType createDefaultFieldType() { + return new DenseVectorFieldMapper.DenseVectorFieldType(); + } +} diff --git a/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/SparseVectorFieldMapperTests.java b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/SparseVectorFieldMapperTests.java new file mode 100644 index 0000000000000..e805cf81bdf8c --- /dev/null +++ b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/SparseVectorFieldMapperTests.java @@ -0,0 +1,164 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.mapper; + +import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.compress.CompressedXContent; +import org.elasticsearch.common.xcontent.XContentFactory; +import org.elasticsearch.common.xcontent.XContentType; +import org.elasticsearch.index.IndexService; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.test.ESSingleNodeTestCase; +import org.hamcrest.Matchers; +import org.junit.Before; + +import java.util.Collection; + +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.core.IsInstanceOf.instanceOf; + +public class SparseVectorFieldMapperTests extends ESSingleNodeTestCase { + private DocumentMapper mapper; + + @Before + public void setup() throws Exception { + IndexService indexService = createIndex("test-index"); + DocumentMapperParser parser = indexService.mapperService().documentMapperParser(); + String mapping = Strings.toString(XContentFactory.jsonBuilder() + .startObject() + .startObject("_doc") + .startObject("properties") + .startObject("my-sparse-vector").field("type", "sparse_vector") + .endObject() + .endObject() + .endObject() + .endObject()); + mapper = parser.parse("_doc", new CompressedXContent(mapping)); + } + + @Override + protected Collection> getPlugins() { + return pluginList(MapperExtrasPlugin.class); + } + + public void testDefaults() throws Exception { + int[] indexedDims = {65535, 50, 2}; + float[] indexedValues = {0.5f, 1800f, -34567.11f}; + ParsedDocument doc1 = mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference + .bytes(XContentFactory.jsonBuilder() + .startObject() + .startObject("my-sparse-vector") + .field(Integer.toString(indexedDims[0]), indexedValues[0]) + .field(Integer.toString(indexedDims[1]), 
indexedValues[1]) + .field(Integer.toString(indexedDims[2]), indexedValues[2]) + .endObject() + .endObject()), + XContentType.JSON)); + IndexableField[] fields = doc1.rootDoc().getFields("my-sparse-vector"); + assertEquals(1, fields.length); + assertThat(fields[0], Matchers.instanceOf(BinaryDocValuesField.class)); + + // assert that after decoding the indexed values are equal to expected + int[] expectedDims = {2, 50, 65535}; //the same as indexed but sorted + float[] expectedValues = {-34567.11f, 1800f, 0.5f}; //the same as indexed but sorted by their dimensions + + // assert that after decoding the indexed dims and values are equal to expected + BytesRef vectorBR = ((BinaryDocValuesField) fields[0]).binaryValue(); + int[] decodedDims = VectorEncoderDecoder.decodeSparseVectorDims(vectorBR); + assertArrayEquals( + "Decoded sparse vector dimensions are not equal to the indexed ones.", + expectedDims, + decodedDims + ); + float[] decodedValues = VectorEncoderDecoder.decodeSparseVector(vectorBR); + assertArrayEquals( + "Decoded sparse vector values are not equal to the indexed ones.", + expectedValues, + decodedValues, + 0.001f + ); + } + + public void testErrors() { + // 1. test for an error on negative dimension + MapperParsingException e = expectThrows(MapperParsingException.class, () -> { + mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference + .bytes(XContentFactory.jsonBuilder() + .startObject() + .startObject("my-sparse-vector") + .field(Integer.toString(-50), 100f) + .endObject() + .endObject()), + XContentType.JSON)); + }); + assertThat(e.getCause(), instanceOf(IllegalArgumentException.class)); + assertThat(e.getCause().getMessage(), containsString( + "dimension number must be a non-negative integer value not exceeding [65535], got [-50]")); + + // 2. 
test for an error on a dimension greater than MAX_DIMS_NUMBER + e = expectThrows(MapperParsingException.class, () -> { + mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference + .bytes(XContentFactory.jsonBuilder() + .startObject() + .startObject("my-sparse-vector") + .field(Integer.toString(70000), 100f) + .endObject() + .endObject()), + XContentType.JSON)); + }); + assertThat(e.getCause(), instanceOf(IllegalArgumentException.class)); + assertThat(e.getCause().getMessage(), containsString( + "dimension number must be a non-negative integer value not exceeding [65535], got [70000]")); + + // 3. test for an error on a wrong formatted dimension + e = expectThrows(MapperParsingException.class, () -> { + mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference + .bytes(XContentFactory.jsonBuilder() + .startObject() + .startObject("my-sparse-vector") + .field("WrongDim123", 100f) + .endObject() + .endObject()), + XContentType.JSON)); + }); + assertThat(e.getCause(), instanceOf(IllegalArgumentException.class)); + assertThat(e.getCause().getMessage(), containsString( + "dimensions should be integers represented as strings, but got [WrongDim123]")); + + // 4. 
test for an error on a wrong format for the map of dims to values + e = expectThrows(MapperParsingException.class, () -> { + mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference + .bytes(XContentFactory.jsonBuilder() + .startObject() + .startObject("my-sparse-vector") + .startArray(Integer.toString(10)).value(10f).value(100f).endArray() + .endObject() + .endObject()), + XContentType.JSON)); + }); + assertThat(e.getCause(), instanceOf(IllegalArgumentException.class)); + assertThat(e.getCause().getMessage(), containsString( + "takes an object that maps a dimension number to a float, but got unexpected token [START_ARRAY]")); + } +} diff --git a/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/SparseVectorFieldTypeTests.java b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/SparseVectorFieldTypeTests.java new file mode 100644 index 0000000000000..9fe8d17e697e6 --- /dev/null +++ b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/SparseVectorFieldTypeTests.java @@ -0,0 +1,28 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.mapper; + +public class SparseVectorFieldTypeTests extends FieldTypeTestCase { + + @Override + protected MappedFieldType createDefaultFieldType() { + return new SparseVectorFieldMapper.SparseVectorFieldType(); + } +} diff --git a/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/VectorEncoderDecoderTests.java b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/VectorEncoderDecoderTests.java new file mode 100644 index 0000000000000..67ab78261375e --- /dev/null +++ b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/VectorEncoderDecoderTests.java @@ -0,0 +1,115 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.mapper; + +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.test.ESTestCase; + +import java.util.HashSet; +import java.util.Set; +import java.util.Arrays; + +public class VectorEncoderDecoderTests extends ESTestCase { + + public void testDenseVectorEncodingDecoding() { + int dimCount = randomIntBetween(0, 300); + float[] expectedValues = new float[dimCount]; + for (int i = 0; i < dimCount; i++) { + expectedValues[i] = randomFloat(); + } + + // test that values that went through encoding and decoding are equal to their original + BytesRef encodedDenseVector = mockEncodeDenseVector(expectedValues); + float[] decodedValues = VectorEncoderDecoder.decodeDenseVector(encodedDenseVector); + assertArrayEquals( + "Decoded dense vector values are not equal to their original.", + expectedValues, + decodedValues, + 0.001f + ); + + } + + public void testSparseVectorEncodingDecoding() { + int dimCount = randomIntBetween(0, 100); + float[] expectedValues = new float[dimCount]; + int[] expectedDims = randomUniqueDims(dimCount); + for (int i = 0; i < dimCount; i++) { + expectedValues[i] = randomFloat(); + } + + // test that sorting in the encoding works as expected + int[] sortedDims = Arrays.copyOf(expectedDims, dimCount); + Arrays.sort(sortedDims); + VectorEncoderDecoder.sortSparseDimsValues(expectedDims, expectedValues, dimCount); + assertArrayEquals( + "Sparse vector dims are not properly sorted!", + sortedDims, + expectedDims + ); + + // test that values that went through encoding and decoding are equal to their original + BytesRef encodedSparseVector = VectorEncoderDecoder.encodeSparseVector(expectedDims, expectedValues, dimCount); + int[] decodedDims = VectorEncoderDecoder.decodeSparseVectorDims(encodedSparseVector); + float[] decodedValues = VectorEncoderDecoder.decodeSparseVector(encodedSparseVector); + assertArrayEquals( + "Decoded sparse vector dims are not equal to their original!", + expectedDims, + decodedDims + 
); + assertArrayEquals( + "Decoded sparse vector values are not equal to their original.", + expectedValues, + decodedValues, + 0.001f + ); + } + + // imitates the code in DenseVectorFieldMapper::parse + private BytesRef mockEncodeDenseVector(float[] dims) { + final short INT_BYTES = VectorEncoderDecoder.INT_BYTES; + byte[] buf = new byte[INT_BYTES * dims.length]; + int offset = 0; + int intValue; + for (float value: dims) { + intValue = Float.floatToIntBits(value); + buf[offset] = (byte) (intValue >> 24); + buf[offset+1] = (byte) (intValue >> 16); + buf[offset+2] = (byte) (intValue >> 8); + buf[offset+3] = (byte) intValue; + offset += INT_BYTES; + } + return new BytesRef(buf, 0, offset); + } + + // generate unique random dims + private int[] randomUniqueDims(int dimCount) { + int[] values = new int[dimCount]; + Set usedValues = new HashSet<>(); + int value; + for (int i = 0; i < dimCount; i++) { + value = randomValueOtherThanMany(usedValues::contains, () -> randomIntBetween(0, SparseVectorFieldMapper.MAX_DIMS_NUMBER)); + usedValues.add(value); + values[i] = value; + } + return values; + } + +} diff --git a/modules/mapper-extras/src/test/resources/rest-api-spec/test/dense-vector/10_indexing.yml b/modules/mapper-extras/src/test/resources/rest-api-spec/test/dense-vector/10_indexing.yml new file mode 100644 index 0000000000000..ef31d0f45e240 --- /dev/null +++ b/modules/mapper-extras/src/test/resources/rest-api-spec/test/dense-vector/10_indexing.yml @@ -0,0 +1,29 @@ +setup: + - skip: + version: " - 6.99.99" + reason: "dense_vector field was introduced in 7.0.0" + + - do: + indices.create: + index: test-index + body: + settings: + number_of_replicas: 0 + mappings: + _doc: + properties: + my_dense_vector: + type: dense_vector + + +--- +"Indexing": + - do: + index: + index: test-index + type: _doc + id: 1 + body: + my_dense_vector: [1.5, -10, 3455, 345452.4545] + + - match: { result: created } diff --git 
# REST test: a document with a sparse_vector field can be indexed successfully.
setup:
  - skip:
      version: " - 6.99.99"
      reason: "sparse_vector field was introduced in 7.0.0"

  - do:
      indices.create:
        index: test-index
        body:
          settings:
            number_of_replicas: 0
          mappings:
            _doc:
              properties:
                my_sparse_vector:
                  type: sparse_vector


---
"Indexing":
  # keys are dimension numbers (as strings), values are floats
  - do:
      index:
        index: test-index
        type: _doc
        id: 1
        body:
          my_sparse_vector: { "50" : 1.8, "2" : -0.4, "10" : 1000.3, "4545" : -0.00004}

  - match: { result: created }