Skip to content

Commit

Permalink
Bring back sparse_vector mapping (elastic#98996)
Browse files Browse the repository at this point in the history
Adds back `sparse_vector` field type, as a copy of `rank_features`. 

The main goal is to have the `sparse_vector` field type available so we
can switch ELSER queries to use the new type.
  • Loading branch information
carlosdelest authored and alex-spies committed Sep 8, 2023
1 parent 3f55232 commit 2562894
Show file tree
Hide file tree
Showing 7 changed files with 468 additions and 115 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/98996.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 98996
summary: Reintroduce `sparse_vector` mapping
area: Mapping
type: enhancement
issues: []
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
---
"Indexing and searching sparse vectors":

- skip:
version: " - 8.10.99"
reason: "sparse_vector field type reintroduced in 8.11"

- do:
indices.create:
index: test
body:
settings:
number_of_replicas: 0
mappings:
properties:
text:
type: text
ml.tokens:
type: sparse_vector

- match: { acknowledged: true }

- do:
index:
index: test
id: "1"
body:
text: "running is good for you"
ml:
tokens:
running: 2.4097164
good: 2.170997
run: 2.052153
race: 1.4575411
for: 1.1908325
runner: 1.1803857
exercise: 1.1652642
you: 0.9654308
training: 0.94999343
sports: 0.93650943
fitness: 0.83129317
best: 0.820365
bad: 0.7385934
health: 0.7098149
marathon: 0.61555296
gym: 0.5652374

- match: { result: "created" }

- do:
index:
index: test
id: "2"
body:
text: "walking is a healthy exercise"
ml:
tokens:
walking: 2.4797723
exercise: 2.074234
healthy: 1.971596
walk: 1.6458614
health: 1.5291847
walker: 1.4736869
activity: 1.0793462
good: 1.0597849
fitness: 0.91855437
training: 0.86342937
movement: 0.7657065
normal: 0.6694081
foot: 0.5892523
physical: 0.4926789

- match: { result: "created" }

- do:
indices.refresh: { }

- do:
search:
index: test
body:
query:
bool:
should:
- term:
ml.tokens:
value: "walk"
boost: 1.9790847
- term:
ml.tokens:
value: "walking"
boost: 1.7092685
- term:
ml.tokens:
value: "exercise"
boost: 0.84076905

- match: { hits.total.value: 2 }
- match: { hits.hits.0._id: "2" }
- match: { hits.hits.1._id: "1" }

---
"Sparse vector in 7.x":
- skip:
features: allowed_warnings
version: "8.0.0 - "
reason: "sparse_vector field type supported in 7.x"
- do:
allowed_warnings:
- "The [sparse_vector] field type is deprecated and will be removed in 8.0."
- "[sparse_vector] field type in old 7.x indices is allowed to contain [sparse_vector] fields, but they cannot be indexed or searched."
indices.create:
index: test
body:
settings:
number_of_replicas: 0
mappings:
properties:
text:
type: text
ml.tokens:
type: sparse_vector

- match: { acknowledged: true }

---
"Sparse vector in 8.x":
- skip:
version: " - 7.99.99, 8.11.0 - "
reason: "sparse_vector field type not supported in 8.x until 8.11.0"
- do:
catch: /The \[sparse_vector\] field type is no longer supported/
indices.create:
index: test
body:
settings:
number_of_replicas: 0
mappings:
properties:
text:
type: text
ml.tokens:
type: sparse_vector
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ private static IndexVersion registerIndexVersion(int id, Version luceneVersion,
* Detached index versions added below here.
*/
public static final IndexVersion V_8_500_000 = registerIndexVersion(8_500_000, Version.LUCENE_9_7_0, "bf656f5e-5808-4eee-bf8a-e2bf6736ff55");
public static final IndexVersion V_8_500_001 = registerIndexVersion(8_500_001, Version.LUCENE_9_7_0, "45045a5a-fc57-4462-89f6-6bc04cda6015");
/*
* STOP! READ THIS FIRST! No, really,
* ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _
Expand Down Expand Up @@ -149,7 +150,7 @@ private static IndexVersion registerIndexVersion(int id, Version luceneVersion,
*/

private static class CurrentHolder {
private static final IndexVersion CURRENT = findCurrent(V_8_500_000);
private static final IndexVersion CURRENT = findCurrent(V_8_500_001);

// finds the pluggable current version, or uses the given fallback
private static IndexVersion findCurrent(IndexVersion fallback) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,41 +8,48 @@

package org.elasticsearch.index.mapper.vectors;

import org.apache.lucene.document.FeatureField;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.logging.DeprecationCategory;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.fielddata.FieldDataContext;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.mapper.DocumentParserContext;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperBuilderContext;
import org.elasticsearch.index.mapper.SourceValueFetcher;
import org.elasticsearch.index.mapper.TextSearchInfo;
import org.elasticsearch.index.mapper.ValueFetcher;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.xcontent.XContentParser.Token;

import java.time.ZoneId;
import java.io.IOException;
import java.util.Map;

import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST;

/**
* A {@link FieldMapper} for indexing a sparse vector of floats.
*
* @deprecated The sparse_vector type was deprecated in 7.x and removed in 8.0. This mapper
* definition only exists so that 7.x indices can be read without error.
*
* TODO: remove in 9.0.
* A {@link FieldMapper} that exposes Lucene's {@link FeatureField} as a sparse
* vector of features.
*/
@Deprecated
public class SparseVectorFieldMapper extends FieldMapper {
private static final DeprecationLogger deprecationLogger = DeprecationLogger.getLogger(SparseVectorFieldMapper.class);
static final String ERROR_MESSAGE = "The [sparse_vector] field type is no longer supported.";
static final String ERROR_MESSAGE_7X = "The [sparse_vector] field type is no longer supported. Old 7.x indices are allowed to "
+ "contain [sparse_vector] fields, but they cannot be indexed or searched.";

public static final String CONTENT_TYPE = "sparse_vector";

static final String ERROR_MESSAGE_7X = "[sparse_vector] field type in old 7.x indices is allowed to "
+ "contain [sparse_vector] fields, but they cannot be indexed or searched.";
static final String ERROR_MESSAGE_8X = "The [sparse_vector] field type is not supported from 8.0 to 8.10 versions.";
static final IndexVersion PREVIOUS_SPARSE_VECTOR_INDEX_VERSION = IndexVersion.V_8_0_0;

static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersion.V_8_500_001;

public static class Builder extends FieldMapper.Builder {

final Parameter<Map<String, String>> meta = Parameter.metaParam();
private final Parameter<Map<String, String>> meta = Parameter.metaParam();

public Builder(String name) {
super(name);
Expand All @@ -65,18 +72,19 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) {
}

public static final TypeParser PARSER = new TypeParser((n, c) -> {
if (c.indexVersionCreated().onOrAfter(IndexVersion.V_8_0_0)) {
throw new IllegalArgumentException(ERROR_MESSAGE);
} else {
if (c.indexVersionCreated().before(PREVIOUS_SPARSE_VECTOR_INDEX_VERSION)) {
deprecationLogger.warn(DeprecationCategory.MAPPINGS, "sparse_vector", ERROR_MESSAGE_7X);
return new Builder(n);
} else if (c.indexVersionCreated().before(NEW_SPARSE_VECTOR_INDEX_VERSION)) {
throw new IllegalArgumentException(ERROR_MESSAGE_8X);
}
});

return new Builder(n);
}, notInMultiFields(CONTENT_TYPE));

public static final class SparseVectorFieldType extends MappedFieldType {

public SparseVectorFieldType(String name, Map<String, String> meta) {
super(name, false, false, false, TextSearchInfo.NONE, meta);
super(name, true, false, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
}

@Override
Expand All @@ -85,28 +93,45 @@ public String typeName() {
}

@Override
public DocValueFormat docValueFormat(String format, ZoneId timeZone) {
throw new UnsupportedOperationException(ERROR_MESSAGE_7X);
public Query existsQuery(SearchExecutionContext context) {
throw new IllegalArgumentException("[sparse_vector] fields do not support [exists] queries");
}

@Override
public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
throw new UnsupportedOperationException(ERROR_MESSAGE_7X);
public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext) {
throw new IllegalArgumentException("[sparse_vector] fields do not support sorting, scripting or aggregating");
}

@Override
public Query existsQuery(SearchExecutionContext context) {
throw new UnsupportedOperationException(ERROR_MESSAGE_7X);
public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
return SourceValueFetcher.identity(name(), context, format);
}

@Override
public Query termQuery(Object value, SearchExecutionContext context) {
throw new UnsupportedOperationException(ERROR_MESSAGE_7X);
return FeatureField.newLinearQuery(name(), indexedValueForSearch(value), DEFAULT_BOOST);
}

private static String indexedValueForSearch(Object value) {
if (value instanceof BytesRef) {
return ((BytesRef) value).utf8ToString();
}
return value.toString();
}
}

private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldType, MultiFields multiFields, CopyTo copyTo) {
super(simpleName, mappedFieldType, multiFields, copyTo);
super(simpleName, mappedFieldType, multiFields, copyTo, false, null);
}

@Override
public Map<String, NamedAnalyzer> indexAnalyzers() {
return Map.of(mappedFieldType.name(), Lucene.KEYWORD_ANALYZER);
}

@Override
public FieldMapper.Builder getMergeBuilder() {
return new Builder(simpleName()).init(this);
}

@Override
Expand All @@ -115,22 +140,72 @@ public SparseVectorFieldType fieldType() {
}

@Override
public void parse(DocumentParserContext context) {
throw new UnsupportedOperationException(ERROR_MESSAGE_7X);
protected boolean supportsParsingObject() {
return true;
}

@Override
public void parse(DocumentParserContext context) throws IOException {

// No support for indexing / searching 7.x sparse_vector field types
if (context.indexSettings().getIndexVersionCreated().before(PREVIOUS_SPARSE_VECTOR_INDEX_VERSION)) {
throw new UnsupportedOperationException(ERROR_MESSAGE_7X);
} else if (context.indexSettings().getIndexVersionCreated().before(NEW_SPARSE_VECTOR_INDEX_VERSION)) {
throw new UnsupportedOperationException(ERROR_MESSAGE_8X);
}

if (context.parser().currentToken() != Token.START_OBJECT) {
throw new IllegalArgumentException(
"[sparse_vector] fields must be json objects, expected a START_OBJECT but got: " + context.parser().currentToken()
);
}

String feature = null;
try {
// make sure that we don't expand dots in field names while parsing
context.path().setWithinLeafObject(true);
for (Token token = context.parser().nextToken(); token != Token.END_OBJECT; token = context.parser().nextToken()) {
if (token == Token.FIELD_NAME) {
feature = context.parser().currentName();
if (feature.contains(".")) {
throw new IllegalArgumentException(
"[sparse_vector] fields do not support dots in feature names but found [" + feature + "]"
);
}
} else if (token == Token.VALUE_NULL) {
// ignore feature, this is consistent with numeric fields
} else if (token == Token.VALUE_NUMBER || token == Token.VALUE_STRING) {
final String key = name() + "." + feature;
float value = context.parser().floatValue(true);
if (context.doc().getByKey(key) != null) {
throw new IllegalArgumentException(
"[sparse_vector] fields do not support indexing multiple values for the same feature ["
+ key
+ "] in the same document"
);
}
context.doc().addWithKey(key, new FeatureField(name(), feature, value));
} else {
throw new IllegalArgumentException(
"[sparse_vector] fields take hashes that map a feature to a strictly positive "
+ "float, but got unexpected token "
+ token
);
}
}
} finally {
context.path().setWithinLeafObject(false);
}
}

@Override
protected void parseCreateField(DocumentParserContext context) {
throw new IllegalStateException("parse is implemented directly");
throw new AssertionError("parse is implemented directly");
}

@Override
protected String contentType() {
return CONTENT_TYPE;
}

@Override
public FieldMapper.Builder getMergeBuilder() {
return new Builder(simpleName()).init(this);
}
}
Loading

0 comments on commit 2562894

Please sign in to comment.