Skip to content

Commit

Permalink
Vector field (#33022)
Browse files Browse the repository at this point in the history
1. Dense vector

PUT dindex
{
  "mappings": {
    "_doc": {
      "properties": {
        "my_vector": {
          "type": "dense_vector"
        },
        "my_text" : {
          "type" : "keyword"
        }
      }
    }
  }
}

PUT dindex/_doc/1
{
  "my_text" : "text1",
  "my_vector" : [ 0.5, 10, 6 ]
}

2. Sparse vector

PUT sindex
{
  "mappings": {
    "_doc": {
      "properties": {
        "my_vector": {
          "type": "sparse_vector"
        },
        "my_text" : {
          "type" : "keyword"
        }
      }
    }
  }
}

PUT sindex/_doc/1
{
  "my_text" : "text1",
  "my_vector" : {"1": 0.5, "99": -0.5,  "5": 1}
}
  • Loading branch information
mayya-sharipova committed Dec 13, 2018
1 parent 9c1cdea commit b5d532f
Show file tree
Hide file tree
Showing 14 changed files with 1,135 additions and 0 deletions.
9 changes: 9 additions & 0 deletions docs/reference/mapping/types.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ string:: <<text,`text`>> and <<keyword,`keyword`>>

<<feature-vector>>:: Record numeric feature vectors to boost hits at query time.

<<dense-vector>>:: Record dense vectors of float values.

<<sparse-vector>>:: Record sparse vectors of float values.


[float]
=== Multi-fields

Expand Down Expand Up @@ -98,3 +103,7 @@ include::types/parent-join.asciidoc[]
include::types/feature.asciidoc[]

include::types/feature-vector.asciidoc[]

include::types/dense-vector.asciidoc[]

include::types/sparse-vector.asciidoc[]
52 changes: 52 additions & 0 deletions docs/reference/mapping/types/dense-vector.asciidoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
[[dense-vector]]
=== Dense vector datatype

A `dense_vector` field stores dense vectors of float values.
A vector can have at most 500 dimensions.
The number of dimensions can be
different across documents. A `dense_vector` field is
a single-valued field.

These vectors can be used for document scoring.
For example, a document score can represent a distance between
a given query vector and the indexed document vector.

You index a dense vector as an array of floats.

[source,js]
--------------------------------------------------
PUT my_index
{
"mappings": {
"_doc": {
"properties": {
"my_vector": {
"type": "dense_vector"
},
"my_text" : {
"type" : "keyword"
}
}
}
}
}
PUT my_index/_doc/1
{
"my_text" : "text1",
"my_vector" : [0.5, 10, 6]
}
PUT my_index/_doc/2
{
"my_text" : "text2",
"my_vector" : [-0.5, 10, 10, 4]
}
--------------------------------------------------
// CONSOLE

Internally, each document's dense vector is encoded as a binary
doc value. Its size in bytes is equal to
`4 * NUMBER_OF_DIMENSIONS`, where `NUMBER_OF_DIMENSIONS` is
the number of the vector's dimensions.
55 changes: 55 additions & 0 deletions docs/reference/mapping/types/sparse-vector.asciidoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
[[sparse-vector]]
=== Sparse vector datatype

A `sparse_vector` field stores sparse vectors of float values.
A vector can have at most 500 dimensions.
The number of dimensions can be
different across documents. A `sparse_vector` field is
a single-valued field.

These vectors can be used for document scoring.
For example, a document score can represent a distance between
a given query vector and the indexed document vector.

You represent a sparse vector as an object, where object fields
are dimensions, and field values are values for these dimensions.
Dimensions are integer values from `0` to `65535` encoded as strings.
Dimensions don't need to be in order.

[source,js]
--------------------------------------------------
PUT my_index
{
"mappings": {
"_doc": {
"properties": {
"my_vector": {
"type": "sparse_vector"
},
"my_text" : {
"type" : "keyword"
}
}
}
}
}
PUT my_index/_doc/1
{
"my_text" : "text1",
"my_vector" : {"1": 0.5, "5": -0.5, "100": 1}
}
PUT my_index/_doc/2
{
"my_text" : "text2",
"my_vector" : {"103": 0.5, "4": -0.5, "5": 1, "11" : 1.2}
}
--------------------------------------------------
// CONSOLE

Internally, each document's sparse vector is encoded as a binary
doc value. Its size in bytes is equal to
`6 * NUMBER_OF_DIMENSIONS`, where `NUMBER_OF_DIMENSIONS` is
the number of the vector's dimensions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.mapper;

import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentParser.Token;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.search.DocValueFormat;
import org.joda.time.DateTimeZone;

import java.io.IOException;
import java.util.List;
import java.util.Map;

import static org.elasticsearch.common.xcontent.XContentParserUtils.ensureExpectedToken;

/**
* A {@link FieldMapper} for indexing a dense vector of floats.
*/
public class DenseVectorFieldMapper extends FieldMapper implements ArrayValueMapperParser {

    public static final String CONTENT_TYPE = "dense_vector";
    public static short MAX_DIMS_COUNT = 500; //maximum allowed number of dimensions
    private static final byte INT_BYTES = 4;

    /**
     * Default field type settings: not tokenized, not indexed, norms omitted,
     * doc values enabled.
     */
    public static class Defaults {
        public static final MappedFieldType FIELD_TYPE = new DenseVectorFieldType();

        static {
            FIELD_TYPE.setTokenized(false);
            FIELD_TYPE.setIndexOptions(IndexOptions.NONE);
            FIELD_TYPE.setHasDocValues(true);
            FIELD_TYPE.setOmitNorms(true);
            FIELD_TYPE.freeze();
        }
    }

    /**
     * Builder producing {@link DenseVectorFieldMapper} instances initialised from {@link Defaults#FIELD_TYPE}.
     */
    public static class Builder extends FieldMapper.Builder<Builder, DenseVectorFieldMapper> {

        public Builder(String name) {
            super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
            builder = this;
        }

        @Override
        public DenseVectorFieldType fieldType() {
            return (DenseVectorFieldType) super.fieldType();
        }

        @Override
        public DenseVectorFieldMapper build(BuilderContext context) {
            setupFieldType(context);
            return new DenseVectorFieldMapper(
                name, fieldType, defaultFieldType,
                context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
        }
    }

    /**
     * Mapping parser for the {@code dense_vector} type. The mapping node is not inspected:
     * a builder with default settings is always returned.
     */
    public static class TypeParser implements Mapper.TypeParser {
        @Override
        public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
            DenseVectorFieldMapper.Builder builder = new DenseVectorFieldMapper.Builder(name);
            return builder;
        }
    }

    /**
     * Field type for dense vectors. Only the exists query is supported; term queries,
     * field data and doc value formatting all throw {@link UnsupportedOperationException}.
     */
    public static final class DenseVectorFieldType extends MappedFieldType {

        public DenseVectorFieldType() {}

        protected DenseVectorFieldType(DenseVectorFieldType ref) {
            super(ref);
        }

        @Override
        public DenseVectorFieldType clone() {
            return new DenseVectorFieldType(this);
        }

        @Override
        public String typeName() {
            return CONTENT_TYPE;
        }

        @Override
        public DocValueFormat docValueFormat(String format, DateTimeZone timeZone) {
            throw new UnsupportedOperationException(
                "Field [" + name() + "] of type [" + typeName() + "] doesn't support docvalue_fields or aggregations");
        }

        @Override
        public Query existsQuery(QueryShardContext context) {
            return new DocValuesFieldExistsQuery(name());
        }

        @Override
        public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) {
            throw new UnsupportedOperationException(
                "Field [" + name() + "] of type [" + typeName() + "] doesn't support sorting, scripting or aggregating");
        }

        @Override
        public Query termQuery(Object value, QueryShardContext context) {
            throw new UnsupportedOperationException(
                "Field [" + name() + "] of type [" + typeName() + "] doesn't support queries");
        }
    }

    private DenseVectorFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
                                   Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
        super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
        assert fieldType.indexOptions() == IndexOptions.NONE;
    }

    @Override
    protected DenseVectorFieldMapper clone() {
        return (DenseVectorFieldMapper) super.clone();
    }

    @Override
    public DenseVectorFieldType fieldType() {
        return (DenseVectorFieldType) super.fieldType();
    }

    /**
     * Reads numeric tokens from the parser until {@code END_ARRAY}, encodes each value as the
     * 4 big-endian bytes of its IEEE 754 bit pattern, and adds the result to the document as a
     * single {@link BinaryDocValuesField}.
     *
     * @param context the parse context supplying the parser and the target document
     * @throws IOException if reading from the parser fails
     * @throws IllegalArgumentException if an external value is set, if the vector has more than
     *         {@link #MAX_DIMS_COUNT} dimensions, or if the document already holds a value
     *         for this field
     */
    @Override
    public void parse(ParseContext context) throws IOException {
        if (context.externalValueSet()) {
            throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] can't be used in multi-fields");
        }

        // encode array of floats as array of integers and store into buf
        // this code is here and not in the VectorEncoderDecoder so as not to create extra arrays
        byte[] buf = new byte[0];
        int offset = 0;
        int dim = 0;
        for (Token token = context.parser().nextToken(); token != Token.END_ARRAY; token = context.parser().nextToken()) {
            // dim counts the values already encoded; a vector of exactly MAX_DIMS_COUNT values is
            // allowed, so only reject when one more value arrives after the limit has been reached
            if (dim >= MAX_DIMS_COUNT) {
                throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
                    "] has exceeded the maximum allowed number of dimensions of :[" + MAX_DIMS_COUNT + "]");
            }
            ensureExpectedToken(Token.VALUE_NUMBER, token, context.parser()::getTokenLocation);
            float value = context.parser().floatValue(true);
            if (buf.length < (offset + INT_BYTES)) {
                buf = ArrayUtil.grow(buf, (offset + INT_BYTES));
            }
            // write the float's bit pattern most significant byte first
            int intValue = Float.floatToIntBits(value);
            buf[offset] = (byte) (intValue >> 24);
            buf[offset+1] = (byte) (intValue >> 16);
            buf[offset+2] = (byte) (intValue >> 8);
            buf[offset+3] = (byte) intValue;
            offset += INT_BYTES;
            dim++;
        }
        // buf may be over-allocated by ArrayUtil.grow; only the first `offset` bytes are valid
        BinaryDocValuesField field = new BinaryDocValuesField(fieldType().name(), new BytesRef(buf, 0, offset));
        if (context.doc().getByKey(fieldType().name()) != null) {
            throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
                "] doesn't not support indexing multiple values for the same field in the same document");
        }
        context.doc().addWithKey(fieldType().name(), field);
    }

    @Override
    protected void parseCreateField(ParseContext context, List<IndexableField> fields) {
        throw new AssertionError("parse is implemented directly");
    }

    @Override
    protected String contentType() {
        return CONTENT_TYPE;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ public Map<String, Mapper.TypeParser> getMappers() {
mappers.put(TokenCountFieldMapper.CONTENT_TYPE, new TokenCountFieldMapper.TypeParser());
mappers.put(FeatureFieldMapper.CONTENT_TYPE, new FeatureFieldMapper.TypeParser());
mappers.put(FeatureVectorFieldMapper.CONTENT_TYPE, new FeatureVectorFieldMapper.TypeParser());
mappers.put(DenseVectorFieldMapper.CONTENT_TYPE, new DenseVectorFieldMapper.TypeParser());
mappers.put(SparseVectorFieldMapper.CONTENT_TYPE, new SparseVectorFieldMapper.TypeParser());
return Collections.unmodifiableMap(mappers);
}

Expand Down
Loading

0 comments on commit b5d532f

Please sign in to comment.