elastic · elasticsearchmachine · Sep 6, 2023 · Aug 29, 2023 · Aug 29, 2023 · Aug 30, 2023
diff --git a/docs/changelog/98996.yaml b/docs/changelog/98996.yaml
@@ -0,0 +1,5 @@
+pr: 98996
+summary: Reintroduce `sparse_vector` mapping
+area: Mapping
+type: enhancement
+issues: []
diff --git a/...pi-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml b/...pi-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml
@@ -0,0 +1,143 @@
+---
+"Indexing and searching sparse vectors":
+
+  - skip:
+      version: " - 8.10.99"
+      reason: "sparse_vector field type reintroduced in 8.11"
+
+  - do:
+      indices.create:
+          index: test
+          body:
+            settings:
+              number_of_replicas: 0
+            mappings:
+              properties:
+                text:
+                   type: text
+                ml.tokens:
+                   type: sparse_vector
+
+  - match: { acknowledged: true }
+
+  - do:
+      index:
+        index: test
+        id: "1"
+        body:
+          text: "running is good for you"
+          ml:
+            tokens:
+              running: 2.4097164
+              good: 2.170997
+              run: 2.052153
+              race: 1.4575411
+              for: 1.1908325
+              runner: 1.1803857
+              exercise: 1.1652642
+              you: 0.9654308
+              training: 0.94999343
+              sports: 0.93650943
+              fitness: 0.83129317
+              best: 0.820365
+              bad: 0.7385934
+              health: 0.7098149
+              marathon: 0.61555296
+              gym: 0.5652374
+
+  - match: { result: "created" }
+
+  - do:
+      index:
+        index: test
+        id: "2"
+        body:
+          text: "walking is a healthy exercise"
+          ml:
+            tokens:
+              walking: 2.4797723
+              exercise: 2.074234
+              healthy: 1.971596
+              walk: 1.6458614
+              health: 1.5291847
+              walker: 1.4736869
+              activity: 1.0793462
+              good: 1.0597849
+              fitness: 0.91855437
+              training: 0.86342937
+              movement: 0.7657065
+              normal: 0.6694081
+              foot: 0.5892523
+              physical: 0.4926789
+
+  - match: { result: "created" }
+
+  - do:
+      indices.refresh: { }
+
+  - do:
+      search:
+        index: test
+        body:
+          query:
+            bool:
+              should:
+                - term:
+                    ml.tokens:
+                      value: "walk"
+                      boost: 1.9790847
+                - term:
+                    ml.tokens:
+                      value: "walking"
+                      boost: 1.7092685
+                - term:
+                    ml.tokens:
+                      value: "exercise"
+                      boost: 0.84076905
+
+  - match: { hits.total.value: 2 }
+  - match: { hits.hits.0._id: "2" }
+  - match: { hits.hits.1._id: "1" }
+
+---
+"Sparse vector in 7.x":
+  - skip:
+      features: allowed_warnings
+      version: "8.0.0 - "
+      reason: "sparse_vector field type supported in 7.x"
+  - do:
+      allowed_warnings:
+        - "The [sparse_vector] field type is deprecated and will be removed in 8.0."
+        - "[sparse_vector] field type in old 7.x indices is allowed to contain [sparse_vector] fields, but they cannot be indexed or searched."
+      indices.create:
+        index: test
+        body:
+          settings:
+            number_of_replicas: 0
+          mappings:
+            properties:
+              text:
+                type: text
+              ml.tokens:
+                type: sparse_vector
+
+  - match: { acknowledged: true }
+
+---
+"Sparse vector in 8.x":
+  - skip:
+      version: " - 7.99.99, 8.11.0 - "
+      reason: "sparse_vector field type not supported in 8.x until 8.11.0"
+  - do:
+      catch: /The \[sparse_vector\] field type is no longer supported/
+      indices.create:
+        index: test
+        body:
+          settings:
+            number_of_replicas: 0
+          mappings:
+            properties:
+              text:
+                type: text
+              ml.tokens:
+                type: sparse_vector
diff --git a/server/src/main/java/org/elasticsearch/index/IndexVersion.java b/server/src/main/java/org/elasticsearch/index/IndexVersion.java
@@ -122,6 +122,7 @@ private static IndexVersion registerIndexVersion(int id, Version luceneVersion,
      * Detached index versions added below here.
      */
     public static final IndexVersion V_8_500_000 = registerIndexVersion(8_500_000, Version.LUCENE_9_7_0, "bf656f5e-5808-4eee-bf8a-e2bf6736ff55");
+    public static final IndexVersion V_8_500_001 = registerIndexVersion(8_500_001, Version.LUCENE_9_7_0, "45045a5a-fc57-4462-89f6-6bc04cda6015");
 
     /*
      * STOP! READ THIS FIRST! No, really,
@@ -145,7 +146,7 @@ private static IndexVersion registerIndexVersion(int id, Version luceneVersion,
      */
 
     private static class CurrentHolder {
-        private static final IndexVersion CURRENT = findCurrent(V_8_500_000);
+        private static final IndexVersion CURRENT = findCurrent(V_8_500_001);
 
         // finds the pluggable current version, or uses the given fallback
         private static IndexVersion findCurrent(IndexVersion fallback) {

diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java
@@ -8,41 +8,52 @@
 
 package org.elasticsearch.index.mapper.vectors;
 
+import org.apache.lucene.document.FeatureField;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.logging.DeprecationCategory;
-import org.elasticsearch.common.logging.DeprecationLogger;
+import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.index.IndexVersion;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.index.fielddata.FieldDataContext;
+import org.elasticsearch.index.fielddata.IndexFieldData;
 import org.elasticsearch.index.mapper.DocumentParserContext;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.MapperBuilderContext;
+import org.elasticsearch.index.mapper.SourceValueFetcher;
 import org.elasticsearch.index.mapper.TextSearchInfo;
 import org.elasticsearch.index.mapper.ValueFetcher;
 import org.elasticsearch.index.query.SearchExecutionContext;
-import org.elasticsearch.search.DocValueFormat;
+import org.elasticsearch.xcontent.XContentParser.Token;
 
-import java.time.ZoneId;
+import java.io.IOException;
 import java.util.Map;
 
+import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST;
+
 /**
- * A {@link FieldMapper} for indexing a sparse vector of floats.
- *
- * @deprecated The sparse_vector type was deprecated in 7.x and removed in 8.0. This mapper
- * definition only exists so that 7.x indices can be read without error.
- *
- * TODO: remove in 9.0.
+ * A {@link FieldMapper} that exposes Lucene's {@link FeatureField} as a sparse
+ * vector of features.
  */
-@Deprecated
 public class SparseVectorFieldMapper extends FieldMapper {
-    private static final DeprecationLogger deprecationLogger = DeprecationLogger.getLogger(SparseVectorFieldMapper.class);
-    static final String ERROR_MESSAGE = "The [sparse_vector] field type is no longer supported.";
-    static final String ERROR_MESSAGE_7X = "The [sparse_vector] field type is no longer supported. Old 7.x indices are allowed to "
-        + "contain [sparse_vector] fields, but they cannot be indexed or searched.";
+
     public static final String CONTENT_TYPE = "sparse_vector";
 
+    static final String ERROR_MESSAGE_7X = "[sparse_vector] field type in old 7.x indices is allowed to "
+        + "contain [sparse_vector] fields, but they cannot be indexed or searched.";
+    static final String ERROR_MESSAGE_8X = "The [sparse_vector] field type is not supported from 8.0 to 8.10 versions.";
+    static final IndexVersion PREVIOUS_SPARSE_VECTOR_INDEX_VERSION = IndexVersion.V_8_0_0;
+
+    static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersion.V_8_500_000;
+
+    private static SparseVectorFieldType ft(FieldMapper in) {
+        return ((SparseVectorFieldMapper) in).fieldType();
+    }
+
     public static class Builder extends FieldMapper.Builder {
 
-        final Parameter<Map<String, String>> meta = Parameter.metaParam();
+        private final Parameter<Map<String, String>> meta = Parameter.metaParam();
 
         public Builder(String name) {
             super(name);
@@ -65,18 +76,19 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) {
     }
 
     public static final TypeParser PARSER = new TypeParser((n, c) -> {
-        if (c.indexVersionCreated().onOrAfter(IndexVersion.V_8_0_0)) {
-            throw new IllegalArgumentException(ERROR_MESSAGE);
-        } else {
+        if (c.indexVersionCreated().before(PREVIOUS_SPARSE_VECTOR_INDEX_VERSION)) {
             deprecationLogger.warn(DeprecationCategory.MAPPINGS, "sparse_vector", ERROR_MESSAGE_7X);
-            return new Builder(n);
+        } else if (c.indexVersionCreated().before(NEW_SPARSE_VECTOR_INDEX_VERSION)) {
+            throw new IllegalArgumentException(ERROR_MESSAGE_8X);
         }
-    });
+
+        return new Builder(n);
+    }, notInMultiFields(CONTENT_TYPE));
 
     public static final class SparseVectorFieldType extends MappedFieldType {
 
         public SparseVectorFieldType(String name, Map<String, String> meta) {
-            super(name, false, false, false, TextSearchInfo.NONE, meta);
+            super(name, true, false, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
         }
 
         @Override
@@ -85,28 +97,45 @@ public String typeName() {
         }
 
         @Override
-        public DocValueFormat docValueFormat(String format, ZoneId timeZone) {
-            throw new UnsupportedOperationException(ERROR_MESSAGE_7X);
+        public Query existsQuery(SearchExecutionContext context) {
+            throw new IllegalArgumentException("[sparse_vector] fields do not support [exists] queries");
         }
 
         @Override
-        public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
-            throw new UnsupportedOperationException(ERROR_MESSAGE_7X);
+        public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext) {
+            throw new IllegalArgumentException("[sparse_vector] fields do not support sorting, scripting or aggregating");
         }
 
         @Override
-        public Query existsQuery(SearchExecutionContext context) {
-            throw new UnsupportedOperationException(ERROR_MESSAGE_7X);
+        public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
+            return SourceValueFetcher.identity(name(), context, format);
         }
 
         @Override
         public Query termQuery(Object value, SearchExecutionContext context) {
-            throw new UnsupportedOperationException(ERROR_MESSAGE_7X);
+            return FeatureField.newLinearQuery(name(), indexedValueForSearch(value), DEFAULT_BOOST);
+        }
+
+        private static String indexedValueForSearch(Object value) {
+            if (value instanceof BytesRef) {
+                return ((BytesRef) value).utf8ToString();
+            }
+            return value.toString();
         }
     }
 
     private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldType, MultiFields multiFields, CopyTo copyTo) {
-        super(simpleName, mappedFieldType, multiFields, copyTo);
+        super(simpleName, mappedFieldType, multiFields, copyTo, false, null);
+    }
+
+    @Override
+    public Map<String, NamedAnalyzer> indexAnalyzers() {
+        return Map.of(mappedFieldType.name(), Lucene.KEYWORD_ANALYZER);
+    }
+
+    @Override
+    public FieldMapper.Builder getMergeBuilder() {
+        return new Builder(simpleName()).init(this);
     }
 
     @Override
@@ -115,22 +144,73 @@ public SparseVectorFieldType fieldType() {
     }
 
     @Override
-    public void parse(DocumentParserContext context) {
-        throw new UnsupportedOperationException(ERROR_MESSAGE_7X);
+    protected boolean supportsParsingObject() {
+        return true;
+    }
+
+    @Override
+    public void parse(DocumentParserContext context) throws IOException {
+
+        // No support for indexing / searching 7.x sparse_vector field types
+        if (context.indexSettings().getIndexVersionCreated().before(PREVIOUS_SPARSE_VECTOR_INDEX_VERSION)) {
+            throw new UnsupportedOperationException(ERROR_MESSAGE_7X);
+        } else if (context.indexSettings().getIndexVersionCreated().before(IndexVersion.V_8_11_0)) {
+            throw new UnsupportedOperationException(ERROR_MESSAGE_8X);
+        }
+
+        if (context.parser().currentToken() != Token.START_OBJECT) {
+            throw new IllegalArgumentException(
+                "[sparse_vector] fields must be json objects, expected a START_OBJECT but got: " + context.parser().currentToken()
+            );
+        }
+
+        String feature = null;
+        try {
+            // make sure that we don't expand dots in field names while parsing
+            context.path().setWithinLeafObject(true);
+            for (Token token = context.parser().nextToken(); token != Token.END_OBJECT; token = context.parser().nextToken()) {
+                if (token == Token.FIELD_NAME) {
+                    feature = context.parser().currentName();
+                    if (feature.contains(".")) {
+                        throw new IllegalArgumentException(
+                            "[sparse_vector] fields do not support dots in feature names but found [" + feature + "]"
+                        );
+                    }
+                } else if (token == Token.VALUE_NULL) {
+                    // ignore feature, this is consistent with numeric fields
+                } else if (token == Token.VALUE_NUMBER || token == Token.VALUE_STRING) {
+                    final String key = name() + "." + feature;
+                    float value = context.parser().floatValue(true);
+                    if (context.doc().getByKey(key) != null) {
+                        throw new IllegalArgumentException(
+                            "[sparse_vector] fields do not support indexing multiple values for the same "
+                                + "rank feature ["
+                                + key
+                                + "] in the same document"
+                        );
+                    }
+                    context.doc().addWithKey(key, new FeatureField(name(), feature, value));
+                } else {
+                    throw new IllegalArgumentException(
+                        "[sparse_vector] fields take hashes that map a feature to a strictly positive "
+                            + "float, but got unexpected token "
+                            + token
+                    );
+                }
+            }
+        } finally {
+            context.path().setWithinLeafObject(false);
+        }
     }
 
     @Override
     protected void parseCreateField(DocumentParserContext context) {
-        throw new IllegalStateException("parse is implemented directly");
+        throw new AssertionError("parse is implemented directly");
     }
 
     @Override
     protected String contentType() {
         return CONTENT_TYPE;
     }
 
-    @Override
-    public FieldMapper.Builder getMergeBuilder() {
-        return new Builder(simpleName()).init(this);
-    }
 }