Vector field (#33022)

1. Dense vector PUT dindex { "mappings": { "_doc": { "properties": { "my_vector": { "type": "dense_vector" }, "my_text" : { "type" : "keyword" } } } } } PUT dindex/_doc/1 { "my_text" : "text1", "my_vector" : [ 0.5, 10, 6 ] } 2. Sparse vector PUT sindex { "mappings": { "_doc": { "properties": { "my_vector": { "type": "sparse_vector" }, "my_text" : { "type" : "keyword" } } } } } PUT sindex/_doc/1 { "my_text" : "text1", "my_vector" : {"1": 0.5, "99": -0.5, "5": 1} }
elastic · Dec 13, 2018 · b5d532f · b5d532f
1 parent 9c1cdea
commit b5d532f
Show file tree

Hide file tree

Showing 14 changed files with 1,135 additions and 0 deletions.
diff --git a/docs/reference/mapping/types.asciidoc b/docs/reference/mapping/types.asciidoc
@@ -47,6 +47,11 @@ string::        <<text,`text`>> and <<keyword,`keyword`>>
 
 <<feature-vector>>:: Record numeric feature vectors to boost hits at query time.
 
+<<dense-vector>>::   Record dense vectors of float values.
+
+<<sparse-vector>>::   Record sparse vectors of float values.
+
+
 [float]
 === Multi-fields
 
@@ -98,3 +103,7 @@ include::types/parent-join.asciidoc[]
 include::types/feature.asciidoc[]
 
 include::types/feature-vector.asciidoc[]
+
+include::types/dense-vector.asciidoc[]
+
+include::types/sparse-vector.asciidoc[]
diff --git a/docs/reference/mapping/types/dense-vector.asciidoc b/docs/reference/mapping/types/dense-vector.asciidoc
@@ -0,0 +1,52 @@
+[[dense-vector]]
+=== Dense vector datatype
+
+A `dense_vector` field stores dense vectors of float values.
+A vector can have at most 500 dimensions. The number of
+dimensions can differ from one document to another.
+A `dense_vector` field is a single-valued field: a document
+can hold only one vector in it.
+
+These vectors can be used for document scoring.
+For example, a document score can represent a distance between
+a given query vector and the indexed document vector.
+
+You index a dense vector as an array of floats.
+
+[source,js]
+--------------------------------------------------
+PUT my_index
+{
+  "mappings": {
+    "_doc": {
+      "properties": {
+        "my_vector": {
+          "type": "dense_vector"
+        },
+        "my_text" : {
+          "type" : "keyword"
+        }
+      }
+    }
+  }
+}
+
+PUT my_index/_doc/1
+{
+  "my_text" : "text1",
+  "my_vector" : [0.5, 10, 6]
+}
+
+PUT my_index/_doc/2
+{
+  "my_text" : "text2",
+  "my_vector" : [-0.5, 10, 10, 4]
+}
+
+--------------------------------------------------
+// CONSOLE
+
+Internally, each document's dense vector is encoded as a binary
+doc value. Its size in bytes is equal to
+`4 * NUMBER_OF_DIMENSIONS`, where `NUMBER_OF_DIMENSIONS` is
+the number of the vector's dimensions.
diff --git a/docs/reference/mapping/types/sparse-vector.asciidoc b/docs/reference/mapping/types/sparse-vector.asciidoc
@@ -0,0 +1,55 @@
+[[sparse-vector]]
+=== Sparse vector datatype
+
+A `sparse_vector` field stores sparse vectors of float values.
+A vector can have at most 500 dimensions.
+The number of dimensions can be
+different across documents. A `sparse_vector` field is
+a single-valued field.
+
+These vectors can be used for document scoring.
+For example, a document score can represent a distance between
+a given query vector and the indexed document vector.
+
+You represent a sparse vector as an object whose field names
+are dimensions and whose field values are the values for those dimensions.
+Dimensions are integer values from `0` to `65535` encoded as strings.
+Dimensions don't need to be in order.
+
+[source,js]
+--------------------------------------------------
+PUT my_index
+{
+  "mappings": {
+    "_doc": {
+      "properties": {
+        "my_vector": {
+          "type": "sparse_vector"
+        },
+        "my_text" : {
+          "type" : "keyword"
+        }
+      }
+    }
+  }
+}
+
+PUT my_index/_doc/1
+{
+  "my_text" : "text1",
+  "my_vector" : {"1": 0.5, "5": -0.5,  "100": 1}
+}
+
+PUT my_index/_doc/2
+{
+  "my_text" : "text2",
+  "my_vector" : {"103": 0.5, "4": -0.5,  "5": 1, "11" : 1.2}
+}
+
+--------------------------------------------------
+// CONSOLE
+
+Internally, each document's sparse vector is encoded as a binary
+doc value. Its size in bytes is equal to
+`6 * NUMBER_OF_DIMENSIONS`, where `NUMBER_OF_DIMENSIONS` is
+the number of the vector's dimensions.
diff --git a/...es/mapper-extras/src/main/java/org/elasticsearch/index/mapper/DenseVectorFieldMapper.java b/...es/mapper-extras/src/main/java/org/elasticsearch/index/mapper/DenseVectorFieldMapper.java
@@ -0,0 +1,195 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.document.BinaryDocValuesField;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.search.DocValuesFieldExistsQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.xcontent.XContentParser.Token;
+import org.elasticsearch.index.fielddata.IndexFieldData;
+import org.elasticsearch.index.query.QueryShardContext;
+import org.elasticsearch.search.DocValueFormat;
+import org.joda.time.DateTimeZone;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import static org.elasticsearch.common.xcontent.XContentParserUtils.ensureExpectedToken;
+
+/**
+ * A {@link FieldMapper} for indexing a dense vector of floats.
+ *
+ * <p>A vector is supplied as an array of numbers and is stored in a single
+ * {@link BinaryDocValuesField}: each value is converted with
+ * {@link Float#floatToIntBits(float)} and written as 4 bytes, most significant byte first,
+ * in array order. At most {@link #MAX_DIMS_COUNT} values are accepted per vector, and a
+ * document may carry only one vector per field.
+ */
+public class DenseVectorFieldMapper extends FieldMapper implements ArrayValueMapperParser {
+
+    public static final String CONTENT_TYPE = "dense_vector";
+    // NOTE(review): public and non-final, so it can be reassigned from outside; confirm nothing
+    // relies on that before making it final.
+    public static short MAX_DIMS_COUNT = 500; //maximum allowed number of dimensions
+    // number of bytes used to encode a single vector value
+    private static final byte INT_BYTES = 4;
+
+    /**
+     * Default settings of the field type: not tokenized, no index options,
+     * doc values enabled and norms omitted.
+     */
+    public static class Defaults {
+        public static final MappedFieldType FIELD_TYPE = new DenseVectorFieldType();
+
+        static {
+            FIELD_TYPE.setTokenized(false);
+            FIELD_TYPE.setIndexOptions(IndexOptions.NONE);
+            FIELD_TYPE.setHasDocValues(true);
+            FIELD_TYPE.setOmitNorms(true);
+            FIELD_TYPE.freeze();
+        }
+    }
+
+    /**
+     * Builder of {@link DenseVectorFieldMapper} instances, seeded with {@link Defaults#FIELD_TYPE}.
+     */
+    public static class Builder extends FieldMapper.Builder<Builder, DenseVectorFieldMapper> {
+
+        public Builder(String name) {
+            super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
+            builder = this;
+        }
+
+        @Override
+        public DenseVectorFieldType fieldType() {
+            return (DenseVectorFieldType) super.fieldType();
+        }
+
+        @Override
+        public DenseVectorFieldMapper build(BuilderContext context) {
+            setupFieldType(context);
+            return new DenseVectorFieldMapper(
+                    name, fieldType, defaultFieldType,
+                    context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
+        }
+    }
+
+    /**
+     * Creates a {@link Builder} for a mapping definition of type {@link #CONTENT_TYPE}.
+     */
+    public static class TypeParser implements Mapper.TypeParser {
+        @Override
+        public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
+            // no mapping parameters are read from `node` here; only the field name is used
+            return new DenseVectorFieldMapper.Builder(name);
+        }
+    }
+
+    /**
+     * Field type of dense vector fields. It supports exists queries only: term queries,
+     * field data and doc value formats are rejected with an {@link UnsupportedOperationException}.
+     */
+    public static final class DenseVectorFieldType extends MappedFieldType {
+
+        public DenseVectorFieldType() {}
+
+        protected DenseVectorFieldType(DenseVectorFieldType ref) {
+            super(ref);
+        }
+
+        @Override
+        public DenseVectorFieldType clone() {
+            return new DenseVectorFieldType(this);
+        }
+
+        @Override
+        public String typeName() {
+            return CONTENT_TYPE;
+        }
+
+        @Override
+        public DocValueFormat docValueFormat(String format, DateTimeZone timeZone) {
+            throw new UnsupportedOperationException(
+                "Field [" + name() + "] of type [" + typeName() + "] doesn't support docvalue_fields or aggregations");
+        }
+
+        @Override
+        public Query existsQuery(QueryShardContext context) {
+            return new DocValuesFieldExistsQuery(name());
+        }
+
+        @Override
+        public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) {
+            throw new UnsupportedOperationException(
+                "Field [" + name() + "] of type [" + typeName() + "] doesn't support sorting, scripting or aggregating");
+        }
+
+        @Override
+        public Query termQuery(Object value, QueryShardContext context) {
+            throw new UnsupportedOperationException(
+                "Field [" + name() + "] of type [" + typeName() + "] doesn't support queries");
+        }
+    }
+
+    private DenseVectorFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
+                                   Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
+        super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
+        assert fieldType.indexOptions() == IndexOptions.NONE;
+    }
+
+    @Override
+    protected DenseVectorFieldMapper clone() {
+        return (DenseVectorFieldMapper) super.clone();
+    }
+
+    @Override
+    public DenseVectorFieldType fieldType() {
+        return (DenseVectorFieldType) super.fieldType();
+    }
+
+    /**
+     * Reads the vector values from the parser up to the closing {@code END_ARRAY} token,
+     * encodes them and adds them to the document as one {@link BinaryDocValuesField}.
+     * The loop starts by advancing the parser, so the parser is presumably positioned on
+     * the array's opening token when this method is called.
+     *
+     * @param context the parse context providing the parser and the document being built
+     * @throws IOException if reading from the parser fails
+     * @throws IllegalArgumentException if an external value is set on the context, if the
+     *         vector has more than {@link #MAX_DIMS_COUNT} values, or if the document already
+     *         has a value for this field
+     */
+    @Override
+    public void parse(ParseContext context) throws IOException {
+        if (context.externalValueSet()) {
+            throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] can't be used in multi-fields");
+        }
+
+        // encode array of floats as array of integers and store into buf
+        // this code is here and not in the VectorEncoderDecoder so as not to create extra arrays
+        byte[] buf = new byte[0];
+        int offset = 0;
+        int dim = 0;
+        for (Token token = context.parser().nextToken(); token != Token.END_ARRAY; token = context.parser().nextToken()) {
+            ensureExpectedToken(Token.VALUE_NUMBER, token, context.parser()::getTokenLocation);
+            // reject the value that would push the vector past the limit; a vector with
+            // exactly MAX_DIMS_COUNT values is still valid
+            if (dim >= MAX_DIMS_COUNT) {
+                throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
+                    "] has exceeded the maximum allowed number of dimensions of :[" + MAX_DIMS_COUNT + "]");
+            }
+            float value = context.parser().floatValue(true);
+            if (buf.length < (offset + INT_BYTES)) {
+                buf = ArrayUtil.grow(buf, (offset + INT_BYTES));
+            }
+            // write the float's int bits, most significant byte first
+            int intValue = Float.floatToIntBits(value);
+            buf[offset] =  (byte) (intValue >> 24);
+            buf[offset+1] = (byte) (intValue >> 16);
+            buf[offset+2] = (byte) (intValue >>  8);
+            buf[offset+3] = (byte) intValue;
+            offset += INT_BYTES;
+            dim++;
+        }
+        // buf may be larger than needed after growing, so only the first `offset` bytes are used
+        BinaryDocValuesField field = new BinaryDocValuesField(fieldType().name(), new BytesRef(buf, 0, offset));
+        if (context.doc().getByKey(fieldType().name()) != null) {
+            throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
+                "] doesn't support indexing multiple values for the same field in the same document");
+        }
+        context.doc().addWithKey(fieldType().name(), field);
+    }
+
+    @Override
+    protected void parseCreateField(ParseContext context, List<IndexableField> fields) {
+        // never expected to be called: parse(ParseContext) is overridden and does the work itself
+        throw new AssertionError("parse is implemented directly");
+    }
+
+    @Override
+    protected String contentType() {
+        return CONTENT_TYPE;
+    }
+}
diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/MapperExtrasPlugin.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/MapperExtrasPlugin.java
@@ -39,6 +39,8 @@ public Map<String, Mapper.TypeParser> getMappers() {
         mappers.put(TokenCountFieldMapper.CONTENT_TYPE, new TokenCountFieldMapper.TypeParser());
         mappers.put(FeatureFieldMapper.CONTENT_TYPE, new FeatureFieldMapper.TypeParser());
         mappers.put(FeatureVectorFieldMapper.CONTENT_TYPE, new FeatureVectorFieldMapper.TypeParser());
+        mappers.put(DenseVectorFieldMapper.CONTENT_TYPE, new DenseVectorFieldMapper.TypeParser());
+        mappers.put(SparseVectorFieldMapper.CONTENT_TYPE, new SparseVectorFieldMapper.TypeParser());
         return Collections.unmodifiableMap(mappers);
     }