-
Notifications
You must be signed in to change notification settings - Fork 24.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1. Dense vector PUT dindex { "mappings": { "_doc": { "properties": { "my_vector": { "type": "dense_vector" }, "my_text" : { "type" : "keyword" } } } } } PUT dindex/_doc/1 { "my_text" : "text1", "my_vector" : [ 0.5, 10, 6 ] } 2. Sparse vector PUT sindex { "mappings": { "_doc": { "properties": { "my_vector": { "type": "sparse_vector" }, "my_text" : { "type" : "keyword" } } } } } PUT sindex/_doc/1 { "my_text" : "text1", "my_vector" : {"1": 0.5, "99": -0.5, "5": 1} }
- Loading branch information
1 parent
9c1cdea
commit b5d532f
Showing
14 changed files
with
1,135 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
[[dense-vector]] | ||
=== Dense vector datatype | ||
|
||
A `dense_vector` field stores dense vectors of float values. | ||
A vector can have at most 500 dimensions. | ||
The number of dimensions can be | ||
different across documents. A `dense_vector` field is | ||
a single-valued field. | ||
|
||
These vectors can be used for document scoring. | ||
For example, a document score can represent a distance between | ||
a given query vector and the indexed document vector. | ||
|
||
You index a dense vector as an array of floats. | ||
|
||
[source,js] | ||
-------------------------------------------------- | ||
PUT my_index | ||
{ | ||
"mappings": { | ||
"_doc": { | ||
"properties": { | ||
"my_vector": { | ||
"type": "dense_vector" | ||
}, | ||
"my_text" : { | ||
"type" : "keyword" | ||
} | ||
} | ||
} | ||
} | ||
} | ||
PUT my_index/_doc/1 | ||
{ | ||
"my_text" : "text1", | ||
"my_vector" : [0.5, 10, 6] | ||
} | ||
PUT my_index/_doc/2 | ||
{ | ||
"my_text" : "text2", | ||
"my_vector" : [-0.5, 10, 10, 4] | ||
} | ||
-------------------------------------------------- | ||
// CONSOLE | ||
|
||
Internally, each document's dense vector is encoded as a binary | ||
doc value. Its size in bytes is equal to | ||
`4 * NUMBER_OF_DIMENSIONS`, where `NUMBER_OF_DIMENSIONS` is | ||
the number of the vector's dimensions. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
[[sparse-vector]] | ||
=== Sparse vector datatype | ||
|
||
A `sparse_vector` field stores sparse vectors of float values. | ||
A vector can have at most 500 dimensions. | ||
The number of dimensions can be | ||
different across documents. A `sparse_vector` field is | ||
a single-valued field. | ||
|
||
These vectors can be used for document scoring. | ||
For example, a document score can represent a distance between | ||
a given query vector and the indexed document vector. | ||
|
||
You represent a sparse vector as an object, where the object's field names | ||
are dimensions, and the field values are the values for these dimensions. | ||
Dimensions are integer values from `0` to `65535` encoded as strings. | ||
Dimensions don't need to be in order. | ||
|
||
[source,js] | ||
-------------------------------------------------- | ||
PUT my_index | ||
{ | ||
"mappings": { | ||
"_doc": { | ||
"properties": { | ||
"my_vector": { | ||
"type": "sparse_vector" | ||
}, | ||
"my_text" : { | ||
"type" : "keyword" | ||
} | ||
} | ||
} | ||
} | ||
} | ||
PUT my_index/_doc/1 | ||
{ | ||
"my_text" : "text1", | ||
"my_vector" : {"1": 0.5, "5": -0.5, "100": 1} | ||
} | ||
PUT my_index/_doc/2 | ||
{ | ||
"my_text" : "text2", | ||
"my_vector" : {"103": 0.5, "4": -0.5, "5": 1, "11" : 1.2} | ||
} | ||
-------------------------------------------------- | ||
// CONSOLE | ||
|
||
Internally, each document's sparse vector is encoded as a binary | ||
doc value. Its size in bytes is equal to | ||
`6 * NUMBER_OF_DIMENSIONS`, where `NUMBER_OF_DIMENSIONS` is | ||
the number of the vector's dimensions. |
195 changes: 195 additions & 0 deletions
195
...es/mapper-extras/src/main/java/org/elasticsearch/index/mapper/DenseVectorFieldMapper.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.index.mapper; | ||
|
||
import org.apache.lucene.document.BinaryDocValuesField; | ||
import org.apache.lucene.index.IndexOptions; | ||
import org.apache.lucene.index.IndexableField; | ||
import org.apache.lucene.search.DocValuesFieldExistsQuery; | ||
import org.apache.lucene.search.Query; | ||
import org.apache.lucene.util.ArrayUtil; | ||
import org.apache.lucene.util.BytesRef; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.common.xcontent.XContentParser.Token; | ||
import org.elasticsearch.index.fielddata.IndexFieldData; | ||
import org.elasticsearch.index.query.QueryShardContext; | ||
import org.elasticsearch.search.DocValueFormat; | ||
import org.joda.time.DateTimeZone; | ||
|
||
import java.io.IOException; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
||
import static org.elasticsearch.common.xcontent.XContentParserUtils.ensureExpectedToken; | ||
|
||
/** | ||
* A {@link FieldMapper} for indexing a dense vector of floats. | ||
*/ | ||
public class DenseVectorFieldMapper extends FieldMapper implements ArrayValueMapperParser { | ||
|
||
public static final String CONTENT_TYPE = "dense_vector"; | ||
public static short MAX_DIMS_COUNT = 500; //maximum allowed number of dimensions | ||
private static final byte INT_BYTES = 4; | ||
|
||
public static class Defaults { | ||
public static final MappedFieldType FIELD_TYPE = new DenseVectorFieldType(); | ||
|
||
static { | ||
FIELD_TYPE.setTokenized(false); | ||
FIELD_TYPE.setIndexOptions(IndexOptions.NONE); | ||
FIELD_TYPE.setHasDocValues(true); | ||
FIELD_TYPE.setOmitNorms(true); | ||
FIELD_TYPE.freeze(); | ||
} | ||
} | ||
|
||
public static class Builder extends FieldMapper.Builder<Builder, DenseVectorFieldMapper> { | ||
|
||
public Builder(String name) { | ||
super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE); | ||
builder = this; | ||
} | ||
|
||
@Override | ||
public DenseVectorFieldType fieldType() { | ||
return (DenseVectorFieldType) super.fieldType(); | ||
} | ||
|
||
@Override | ||
public DenseVectorFieldMapper build(BuilderContext context) { | ||
setupFieldType(context); | ||
return new DenseVectorFieldMapper( | ||
name, fieldType, defaultFieldType, | ||
context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo); | ||
} | ||
} | ||
|
||
public static class TypeParser implements Mapper.TypeParser { | ||
@Override | ||
public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException { | ||
DenseVectorFieldMapper.Builder builder = new DenseVectorFieldMapper.Builder(name); | ||
return builder; | ||
} | ||
} | ||
|
||
public static final class DenseVectorFieldType extends MappedFieldType { | ||
|
||
public DenseVectorFieldType() {} | ||
|
||
protected DenseVectorFieldType(DenseVectorFieldType ref) { | ||
super(ref); | ||
} | ||
|
||
public DenseVectorFieldType clone() { | ||
return new DenseVectorFieldType(this); | ||
} | ||
|
||
@Override | ||
public String typeName() { | ||
return CONTENT_TYPE; | ||
} | ||
|
||
@Override | ||
public DocValueFormat docValueFormat(String format, DateTimeZone timeZone) { | ||
throw new UnsupportedOperationException( | ||
"Field [" + name() + "] of type [" + typeName() + "] doesn't support docvalue_fields or aggregations"); | ||
} | ||
|
||
@Override | ||
public Query existsQuery(QueryShardContext context) { | ||
return new DocValuesFieldExistsQuery(name()); | ||
} | ||
|
||
@Override | ||
public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) { | ||
throw new UnsupportedOperationException( | ||
"Field [" + name() + "] of type [" + typeName() + "] doesn't support sorting, scripting or aggregating"); | ||
} | ||
|
||
@Override | ||
public Query termQuery(Object value, QueryShardContext context) { | ||
throw new UnsupportedOperationException( | ||
"Field [" + name() + "] of type [" + typeName() + "] doesn't support queries"); | ||
} | ||
} | ||
|
||
private DenseVectorFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType, | ||
Settings indexSettings, MultiFields multiFields, CopyTo copyTo) { | ||
super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo); | ||
assert fieldType.indexOptions() == IndexOptions.NONE; | ||
} | ||
|
||
@Override | ||
protected DenseVectorFieldMapper clone() { | ||
return (DenseVectorFieldMapper) super.clone(); | ||
} | ||
|
||
@Override | ||
public DenseVectorFieldType fieldType() { | ||
return (DenseVectorFieldType) super.fieldType(); | ||
} | ||
|
||
@Override | ||
public void parse(ParseContext context) throws IOException { | ||
if (context.externalValueSet()) { | ||
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] can't be used in multi-fields"); | ||
} | ||
|
||
// encode array of floats as array of integers and store into buf | ||
// this code is here and not int the VectorEncoderDecoder so not to create extra arrays | ||
byte[] buf = new byte[0]; | ||
int offset = 0; | ||
int dim = 0; | ||
for (Token token = context.parser().nextToken(); token != Token.END_ARRAY; token = context.parser().nextToken()) { | ||
ensureExpectedToken(Token.VALUE_NUMBER, token, context.parser()::getTokenLocation); | ||
float value = context.parser().floatValue(true); | ||
if (buf.length < (offset + INT_BYTES)) { | ||
buf = ArrayUtil.grow(buf, (offset + INT_BYTES)); | ||
} | ||
int intValue = Float.floatToIntBits(value); | ||
buf[offset] = (byte) (intValue >> 24); | ||
buf[offset+1] = (byte) (intValue >> 16); | ||
buf[offset+2] = (byte) (intValue >> 8); | ||
buf[offset+3] = (byte) intValue; | ||
offset += INT_BYTES; | ||
dim++; | ||
if (dim >= MAX_DIMS_COUNT) { | ||
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + | ||
"] has exceeded the maximum allowed number of dimensions of :[" + MAX_DIMS_COUNT + "]"); | ||
} | ||
} | ||
BinaryDocValuesField field = new BinaryDocValuesField(fieldType().name(), new BytesRef(buf, 0, offset)); | ||
if (context.doc().getByKey(fieldType().name()) != null) { | ||
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + | ||
"] doesn't not support indexing multiple values for the same field in the same document"); | ||
} | ||
context.doc().addWithKey(fieldType().name(), field); | ||
} | ||
|
||
@Override | ||
protected void parseCreateField(ParseContext context, List<IndexableField> fields) { | ||
throw new AssertionError("parse is implemented directly"); | ||
} | ||
|
||
@Override | ||
protected String contentType() { | ||
return CONTENT_TYPE; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.