# OpenSearch query example

This notebook is a fully working OpenSearch query example that we can use for discussion and development before adding the constraints of request and response models and tests.

There are various TODOs in here indicating some of the decisions that need to be made.

In [24]:
%load_ext autoreload
%autoreload 2

import os
import time

import numpy as np

from app.index import OpenSearchIndex
from app.ml import SBERTEncoder

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Setup

### 1a. Connect to OpenSearch
As we're running outside of docker-compose, we connect to OpenSearch via localhost.

In [2]:
# Connection to the local development instance used throughout this notebook.
# Credentials are read from the OPENSEARCH_USER / OPENSEARCH_PASSWORD
# environment variables so real secrets never have to be written into (or
# committed with) the notebook; when unset they fall back to the local
# development defaults.
opensearch = OpenSearchIndex(
    url="http://localhost:9200",
    username=os.environ.get("OPENSEARCH_USER", "admin"),
    password=os.environ.get("OPENSEARCH_PASSWORD", "admin"),
    index_name="navigator",
    # TODO: convert the remaining settings (url, index name, SSL options) to env variables?
    opensearch_connector_kwargs={
        "use_ssl": False,
        "verify_certs": False,
        "ssl_show_warn": False,
    },
    # The sentence encoder loaded in section 1b produces 768-dimensional vectors.
    embedding_dim=768,
)

# Fail-fast visibility: shows whether the connection is usable before any query is run.
print(opensearch.is_connected())

# Handle later used to issue `.search()` calls directly.
opns = opensearch.opns

True


### 1b. Load sentence-BERT encoder

This is used to generate embeddings for semantic search.

In [3]:
# Query-time sentence encoder, used further down to embed search queries for the KNN clause.
# TODO: this needs to be the same model as used for indexing. At a later stage when we start updating
# models we may want a way of ensuring both models are the same.
enc = SBERTEncoder(model_name="msmarco-distilbert-dot-v5")
# Sanity check: display the shape of the embedding produced for a single string.
enc.encode("hello world").shape

(768,)

In [4]:
# Encode three related phrases, in order; the generator expression keeps its
# loop variable out of the notebook namespace.
emba, embb, embc = (
    enc.encode(phrase) for phrase in ("bicycle race", "car race", "tortoise race")
)

# Dot-product similarity of "bicycle race" against each of the other two phrases.
(np.dot(emba, embb), np.dot(emba, embc))

(80.994026, 76.67018)

## 2. Run search

The `run_query` function does all of the heavy lifting here. See various TODOs and [issue #420](https://github.com/climatepolicyradar/navigator/issues/420) for discussion points.

In [27]:
def _innerproduct_threshold_to_lucene_threshold(ip_thresh: float) -> float:
    """
    Opensearch documentation on mapping similarity functions to Lucene thresholds is here: https://github.com/opensearch-project/k-NN/blob/main/src/main/java/org/opensearch/knn/index/SpaceType.java#L33
    It defines 'inner product' as negative inner product i.e. a distance rather than similarity measure, so we reverse the signs of inner product here compared to the docs.
    """
    if ip_thresh > 0:
        return ip_thresh + 1
    else:
        return 1 / (1-ip_thresh)
    

def _build_opns_query(
    query: str,
    embedding,
    lucene_threshold: float,
    max_passages_per_doc: int,
    knn_k: int = 10,
) -> dict:
    """Build the search request body: scored `should` clauses plus a per-document aggregation.

    Args:
        query: raw query string, used for the text, title and description clauses.
        embedding: vector for `query`, used by the KNN clause on `text_embedding`.
        lucene_threshold: minimum Lucene score a KNN match must reach to count.
        max_passages_per_doc: number of top passages returned for each document bucket.
        knn_k: the `k` passed to the KNN clause.

    Returns:
        The request body as a plain dict.
    """
    return {
        "size": 0,  # only return aggregations
        "query": {
            "bool": {
                "should": [
                    # Text passage matching
                    {
                        "match": {
                            "text": {
                                "query": query,
                                "boost": 1,
                            },
                        }
                    },
                    {
                        "match_phrase": {
                            "text": {
                                "query": query,
                                "boost": 1,
                            },
                        }
                    },
                    # Text passage semantic search (KNN)
                    {
                        # TODO: setting the KNN threshold too high essentially filters out
                        # the KNN results but leaves the others in. This should be documented.
                        "function_score": {
                            "query": {
                                "knn": {
                                    "text_embedding": {
                                        "vector": embedding,
                                        # TODO: the original note says this k value "should
                                        # match above" - clarify which value it has to match.
                                        "k": knn_k,
                                    },
                                },
                            },
                            "min_score": lucene_threshold,
                        }
                    },
                    # Action (to be document) title matching
                    {
                        "match_phrase": {
                            "name": {
                                "query": query,
                                "boost": 10,
                            },
                        }
                    },
                    # Alternative title clause kept for discussion (currently disabled):
                    # {
                    #     "prefix": {
                    #         "name": {
                    #             "value": query,
                    #             "boost": 10,
                    #             "case_insensitive": True,
                    #         },
                    #     }
                    # },
                    # Action (to be document) description matching
                    {
                        "match": {
                            "description": {
                                "query": query,
                                "boost": 1,
                            },
                        }
                    },
                ],
                # A hit must satisfy at least one of the clauses above.
                "minimum_should_match": 1,
            },
        },
        "aggs": {
            # One bucket per document, ordered by the score of its best hit.
            "top_docs": {
                "terms": {
                    "field": "document_id",
                    "order": {"top_hit": "desc"},
                },
                "aggs": {
                    # Best-scoring hits within the document; the embedding is
                    # excluded from the returned source.
                    "top_passage_hits": {
                        "top_hits": {
                            "_source": {"excludes": ["text_embedding"]},
                            "size": max_passages_per_doc,
                        }
                    },
                    # Highest score in the bucket; drives the bucket ordering above.
                    "top_hit": {"max": {"script": {"source": "_score"}}},
                },
            }
        },
    }


def run_query(
    query: str,
    innerproduct_threshold: float,
    max_passages_per_doc: int,
    knn_k: int = 10,
    index_name: str = "navigator",
) -> dict:
    """Run a combined keyword + semantic search and return the raw response.

    The query is embedded with the notebook-level `enc` encoder, the request is
    sent through the notebook-level `opns` client, and the time spent in the
    search call is printed.

    Args:
        query: free-text search string.
        innerproduct_threshold: minimum inner-product similarity for KNN matches;
            converted to a Lucene score before being used as `min_score`.
        max_passages_per_doc: number of top passages to return per document.
        knn_k: `k` for the KNN clause. Defaults to 10, the value previously
            hard-coded in the query.
        index_name: index to search. Defaults to "navigator", the value
            previously hard-coded in the search call.

    Returns:
        The response of `opns.search` for the request.
    """
    embedding = enc.encode(query)
    lucene_threshold = _innerproduct_threshold_to_lucene_threshold(innerproduct_threshold)

    opns_query = _build_opns_query(
        query=query,
        embedding=embedding,
        lucene_threshold=lucene_threshold,
        max_passages_per_doc=max_passages_per_doc,
        knn_k=knn_k,
    )

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    response = opns.search(
        body=opns_query,
        index=index_name,
        request_timeout=30,
        # Custom preference string: per the search API docs, requests sharing the
        # same string are routed to the same shard copies, which should keep
        # results consistent across repeated queries.
        # TODO: confirm this is the intent and decide on the value to use.
        preference="prototype_user",
        # NOTE(review): the body sets size=0, so no top-level hits come back -
        # check whether `explain` has any effect on the top_hits aggregation.
        explain=True,
    )
    end = time.perf_counter()
    print(f"query execution time: {round(end-start, 2)}s")

    return response

# Example query: the full title of one of the indexed actions (it appears as `name` in the output below).
# With innerproduct_threshold=70 the KNN clause is given a Lucene min_score of 71 (positive thresholds map to threshold + 1).
# TODO: we should experimentally adjust this threshold 
run_query(
    "National Home Retrofit Scheme 2020 (One Stop Shop Development Call)", 
    innerproduct_threshold=70, # Same as prototype
    max_passages_per_doc=10
)

query execution time: 0.27s


{'took': 255,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2835, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'top_docs': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 944,
   'buckets': [{'key': '1151',
     'doc_count': 116,
     'top_hit': {'value': 192.987548828125},
     'top_passage_hits': {'hits': {'total': {'value': 116, 'relation': 'eq'},
       'max_score': 192.98755,
       'hits': [{'_index': 'navigator',
         '_type': '_doc',
         '_id': '57RI1X8B6OZKDs8-mUPu',
         '_score': 192.98755,
         '_source': {'document_name': "Scheme's guidelines",
          'country_code': 'IRL',
          'action_id': 913,
          'action_source_name': 'CCLW',
          'name': 'National Home Retrofit Scheme 2020 (One Stop Shop Development Call)',
          'action_date': '25/12/2020',
          'document_id': 1151,
          'document_language_id': 1826.0,
  