In [1]:
%%bash
pip install elasticsearch==8.5.2 numpy

Collecting elasticsearch==8.5.2
  Downloading elasticsearch-8.5.2-py3-none-any.whl (385 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m385.3/385.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting numpy
  Downloading numpy-1.26.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting elastic-transport<9,>=8 (from elasticsearch==8.5.2)
  Downloading elastic_transport-8.11.0-py3-none-any.whl.metadata (3.5 kB)
Downloading numpy-1.26.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading elastic_transport-8.11.0-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.8/59.8 kB[0m [

In [2]:
from elasticsearch import Elasticsearch, helpers
import numpy as np

In [3]:
# Create the Elasticsearch client used by every cell below and echo it as the
# cell output so the configured host is visible.
client = Elasticsearch(hosts="http://elasticsearch:9200")
client

<Elasticsearch(['http://elasticsearch:9200'])>

In [46]:
# Name of the demo index and the dimensionality of the vectors stored in it.
INDEX_NAME = "vector-index"
N_DIMS = 2

# Start from a clean slate so this cell can be re-run: drop the index if it
# already exists before (re-)creating it.
if client.indices.exists(index=INDEX_NAME).body:
    client.indices.delete(index=INDEX_NAME)
    print("index deleted!")

client.indices.create(
    index=INDEX_NAME,
    # A single primary shard and no replicas.
    settings={
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-types.html
    mappings={
        "properties": {
            "name": {
                "type": "keyword"
            },
            "value": {
                "type": "long"
            },
            "vector": {
                # https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params
                "type": "dense_vector",
                "dims": N_DIMS,
                # Must be True to search this field with approximate kNN (the
                # `knn` search option). If you only run exact, brute-force kNN
                # through `script_score`, set it to False to significantly
                # improve indexing speed.
                "index": True,
                # Vector similarity metric used by approximate kNN search.
                "similarity": "cosine"
            }
        }
    }
)

index deleted!


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vector-index'})

In [47]:
# Build 50 documents, each carrying a random N_DIMS-dimensional vector, and
# index them in one bulk request.
# A seeded generator keeps the indexed vectors identical across runs.
rng = np.random.default_rng(42)
docs = [
    {
        "_index": INDEX_NAME,
        "_source": {
            "name": str(i),
            "value": i,
            "vector": rng.random(N_DIMS).tolist()
        }
    }
    for i in range(50)
]
res = helpers.bulk(client, docs)

# Newly indexed documents only become searchable after a refresh. Force one
# here so that the search cells that follow see all documents even when the
# notebook is executed non-interactively ("Run All").
client.indices.refresh(index=INDEX_NAME)

# (number of successfully executed actions, list of errors)
res

(50, [])

In [48]:
# search test: Basic
# Fetch a single document to confirm that indexing worked. The index is
# addressed through INDEX_NAME so this cell keeps working if the index is
# renamed in the configuration cell.

response = client.search(index=INDEX_NAME, size=1)
response["hits"]["hits"][0]

{'_index': 'vector-index',
 '_id': 'I-se8owBu7_hAk1Ejdcn',
 '_score': 1.0,
 '_source': {'name': '0',
  'value': 0,
  'vector': [0.3718906891612188, 0.2088208801410606]}}

In [79]:
# search test: Approximate kNN
# https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#knn-similarity-search
#
# Requires the "vector" field to be mapped with "index": True.

response = client.search(
    # Use the shared constant instead of repeating the index name literal.
    index=INDEX_NAME,
    knn={
        "field": "vector",
        # Random query vector with the same dimensionality as the indexed ones.
        "query_vector": np.random.rand(N_DIMS).tolist(),
        # Number of nearest neighbours to return.
        "k": 10,
        # Number of candidates examined per shard before picking the top k.
        "num_candidates": 100,
        # Restrict the kNN search to documents whose "value" is <= 5.
        "filter": {
            "bool" : {
                "filter" : {
                  "range" : {"value" : { "lte": 5 }}
                }
            }
        }
    },
    # Only return these fields of each hit's _source.
    source_includes=["name", "vector"],
    # fields=["name", "vector"],
)
print("result:", response["hits"]["total"])
print("max:", response["hits"]["max_score"])

# One line per hit: score followed by the returned source fields.
for obj in response["hits"]["hits"]:
    print(obj["_score"], obj['_source'])

result: {'value': 6, 'relation': 'eq'}
max: 0.99985754
0.99985754 {'name': '5', 'vector': [0.8537696417742492, 0.664062133932174]}
0.99481416 {'name': '2', 'vector': [0.9851137218806424, 0.591550115816482]}
0.99251133 {'name': '0', 'vector': [0.3718906891612188, 0.2088208801410606]}
0.93329597 {'name': '4', 'vector': [0.733561040786174, 0.12023735760251242]}
0.93143773 {'name': '3', 'vector': [0.20893348959824387, 0.5618820324723092]}
0.8891889 {'name': '1', 'vector': [0.20155002196109328, 0.9590155010604668]}


In [69]:
# search test: Exact, brute-force kNN
#
# Scores every document matched by the inner query with a script, so it works
# without an approximate-kNN index on the vector field.

query = {
    "script_score": {
        # "query": {"match_all": {}},
        # Only documents whose "value" is >= 25 are scored.
        "query" : {
            "bool" : {
                "filter" : {
                  "range" : {"value" : { "gte": 25 }}
                }
            }
        },
        "script": {
            # Cosine similarity lies in [-1, 1]; (x + 1) / 2 maps it to [0, 1]
            # because script_score must not produce negative scores.
            "source": "(cosineSimilarity(params.queryVector, 'vector') + 1) / 2",
            # Random query vector with the same dimensionality as the indexed ones.
            "params": {"queryVector": np.random.rand(N_DIMS).tolist()}
        }
    }
}

response = client.search(
    # Use the shared constant instead of repeating the index name literal.
    index=INDEX_NAME,
    query=query,
    size=10,
    # Only return these fields of each hit's _source.
    source_includes=["name", "vector"],
    # fields=["name", "vector"],
)
print("result:", response["hits"]["total"])
print("max:", response["hits"]["max_score"])

# One line per hit: score followed by the returned source fields.
for obj in response["hits"]["hits"]:
    print(obj["_score"], obj['_source'])

result: {'value': 25, 'relation': 'eq'}
max: 0.9999603
0.9999603 {'name': '44', 'vector': [0.008809141187231528, 0.6949595087693148]}
0.99973714 {'name': '26', 'vector': [0.028083398437410922, 0.8636727207368365]}
0.99758077 {'name': '40', 'vector': [0.029972109738166552, 0.30333338066093274]}
0.99246377 {'name': '31', 'vector': [0.14111045026077274, 0.803158292966446]}
0.9920347 {'name': '33', 'vector': [0.13841690735579237, 0.7658192449866671]}
0.9771153 {'name': '41', 'vector': [0.28827658527392896, 0.9195358731988713]}
0.9663541 {'name': '34', 'vector': [0.37537263144608435, 0.9706070020075324]}
0.95907885 {'name': '43', 'vector': [0.32355965546215826, 0.7496310258282273]}
0.94053614 {'name': '36', 'vector': [0.34108133469575264, 0.6352491695686251]}
0.90907675 {'name': '47', 'vector': [0.6596271298718076, 0.9384123138535823]}
