Skip to content

Commit

Permalink
Add fielddata and scripting support for byte-sized vectors (#91184)
Browse files Browse the repository at this point in the history
This change adds fielddata support, and subsequently scripting support, for byte vectors. This is a follow-up to
#90774 and completes the initial work for #89784.
  • Loading branch information
jdconrad committed Nov 10, 2022
1 parent 627b942 commit 89e0a6d
Show file tree
Hide file tree
Showing 29 changed files with 3,202 additions and 226 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/91184.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 91184
summary: Add fielddata and scripting support for byte-sized vectors
area: Vector Search
type: feature
issues: []
9 changes: 3 additions & 6 deletions docs/reference/mapping/types/dense-vector.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ The `dense_vector` field type stores dense vectors of numeric values. Dense
vector fields are primarily used for <<knn-search,k-nearest neighbor (kNN) search>>.

The `dense_vector` type does not support aggregations or sorting.
When <<dense-vector-params, `element_type`>> is `byte`
<<query-dsl-script-score-query,`script_score`>> is not supported.

You add a `dense_vector` field as an array of numeric values
based on <<dense-vector-params, `element_type`>> with
Expand Down Expand Up @@ -108,10 +106,9 @@ The following mapping parameters are accepted:
The data type used to encode vectors. The supported data types are
`float` (default) and `byte`. `float` indexes a 4-byte floating-point
value per dimension. `byte` indexes a 1-byte integer value per dimension.
`byte` requires `index` to be `true`. Using `byte` can result in a
substantially smaller index size with the trade off of lower
precision. Vectors using `byte` require dimensions with integer values
between -128 to 127, inclusive for both indexing and searching.
Using `byte` can result in a substantially smaller index size with the
trade-off of lower precision. Vectors using `byte` require dimensions with
integer values between -128 and 127, inclusive, for both indexing and searching.

`dims`::
(Required, integer)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ class org.elasticsearch.script.field.vectors.DenseVector {
DenseVector EMPTY
float getMagnitude()

# handle List<Number> and float[] arguments
# handle List<Number>, float[], and byte[] arguments
double dotProduct(Object)
double l1Norm(Object)
double l2Norm(Object)
Expand All @@ -147,23 +147,7 @@ class org.elasticsearch.script.field.vectors.DenseVector {
int size()
}

# implementation of DenseVector
class org.elasticsearch.script.field.vectors.BinaryDenseVector {
}

# implementation of DenseVector
class org.elasticsearch.script.field.vectors.KnnDenseVector {
}

class org.elasticsearch.script.field.vectors.DenseVectorDocValuesField {
DenseVector get()
DenseVector get(DenseVector)
}

# implementation of DenseVectorDocValuesField
class org.elasticsearch.script.field.vectors.KnnDenseVectorDocValuesField {
}

# implementation of DenseVectorDocValuesField
class org.elasticsearch.script.field.vectors.BinaryDenseVectorDocValuesField {
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
setup:
- skip:
version: " - 8.5.99"
reason: "support for byte vectors added in 8.6"
features: headers

- do:
indices.create:
index: test-index
body:
settings:
number_of_replicas: 0
mappings:
properties:
vector:
type: dense_vector
element_type: byte
dims: 5
indexed_vector:
type: dense_vector
element_type: byte
dims: 5
index: true
similarity: cosine

- do:
index:
index: test-index
id: "1"
body:
vector: [8, 5, -15, 1, -7]
indexed_vector: [8, 5, -15, 1, -7]

- do:
index:
index: test-index
id: "2"
body:
vector: [-1, 115, -3, 4, -128]
indexed_vector: [-1, 115, -3, 4, -128]

- do:
index:
index: test-index
id: "3"
body:
vector: [2, 18, -5, 0, -124]
indexed_vector: [2, 18, -5, 0, -124]

- do:
indices.refresh: {}

---
"Dot Product":
- do:
headers:
Content-Type: application/json
search:
rest_total_hits_as_int: true
body:
query:
script_score:
query: {match_all: {} }
script:
source: "dotProduct(params.query_vector, 'vector')"
params:
query_vector: [0, 111, -13, 14, -124]

- match: {hits.total: 3}

- match: {hits.hits.0._id: "2"}
- match: {hits.hits.0._score: 28732.0}

- match: {hits.hits.1._id: "3"}
- match: {hits.hits.1._score: 17439.0}

- match: {hits.hits.2._id: "1"}
- match: {hits.hits.2._score: 1632.0}

---
"Cosine Similarity":
- do:
headers:
Content-Type: application/json
search:
rest_total_hits_as_int: true
body:
query:
script_score:
query: {match_all: {} }
script:
source: "cosineSimilarity(params.query_vector, 'vector')"
params:
query_vector: [0, 111, -13, 14, -124]

- match: {hits.total: 3}

- match: {hits.hits.0._id: "2"}
- gte: {hits.hits.0._score: 0.995}
- lte: {hits.hits.0._score: 0.998}

- match: {hits.hits.1._id: "3"}
- gte: {hits.hits.1._score: 0.829}
- lte: {hits.hits.1._score: 0.832}

- match: {hits.hits.2._id: "1"}
- gte: {hits.hits.2._score: 0.509}
- lte: {hits.hits.2._score: 0.512}

---
"Cosine similarity with indexed vector":
- do:
headers:
Content-Type: application/json
search:
rest_total_hits_as_int: true
body:
query:
script_score:
query: {match_all: {} }
script:
source: "cosineSimilarity(params.query_vector, 'indexed_vector')"
params:
query_vector: [0, 111, -13, 14, -124]

- match: {hits.total: 3}

- match: {hits.hits.0._id: "2"}
- gte: {hits.hits.0._score: 0.995}
- lte: {hits.hits.0._score: 0.998}

- match: {hits.hits.1._id: "3"}
- gte: {hits.hits.1._score: 0.829}
- lte: {hits.hits.1._score: 0.832}

- match: {hits.hits.2._id: "1"}
- gte: {hits.hits.2._score: 0.509}
- lte: {hits.hits.2._score: 0.512}
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
setup:
- skip:
version: " - 8.5.99"
reason: "support for byte vectors added in 8.6"
features: headers

- do:
indices.create:
index: test-index
body:
settings:
number_of_replicas: 0
mappings:
properties:
my_dense_vector:
type: dense_vector
element_type: byte
dims: 5

- do:
index:
index: test-index
id: "1"
body:
my_dense_vector: [8, 5, -15, 1, -7]

- do:
index:
index: test-index
id: "2"
body:
my_dense_vector: [-1, 115, -3, 4, -128]

- do:
index:
index: test-index
id: "3"
body:
my_dense_vector: [2, 18, -5, 0, -124]

- do:
indices.refresh: {}

---
"L1 norm":
- do:
headers:
Content-Type: application/json
search:
rest_total_hits_as_int: true
body:
query:
script_score:
query: {match_all: {} }
script:
source: "l1norm(params.query_vector, 'my_dense_vector')"
params:
query_vector: [0, 111, -13, 14, -124]

- match: {hits.total: 3}

- match: {hits.hits.0._id: "1"}
- match: {hits.hits.0._score: 246.0}

- match: {hits.hits.1._id: "3"}
- match: {hits.hits.1._score: 117.0}

- match: {hits.hits.2._id: "2"}
- gte: {hits.hits.2._score: 29.0}

---
"L2 norm":
- do:
headers:
Content-Type: application/json
search:
rest_total_hits_as_int: true
body:
query:
script_score:
query: {match_all: {} }
script:
source: "l2norm(params.query_vector, 'my_dense_vector')"
params:
query_vector: [0, 111, -13, 14, -124]

- match: {hits.total: 3}

- match: {hits.hits.0._id: "1"}
- gte: {hits.hits.0._score: 158.624}
- lte: {hits.hits.0._score: 158.627}

- match: {hits.hits.1._id: "3"}
- gte: {hits.hits.1._score: 94.407}
- lte: {hits.hits.1._score: 94.41}

- match: {hits.hits.2._id: "2"}
- gte: {hits.hits.2._score: 15.263}
- lte: {hits.hits.2._score: 15.266}

0 comments on commit 89e0a6d

Please sign in to comment.