# Opensearch query example

This notebook serves as a fully working Opensearch query example that we can use for discussion and development before adding the constraints of request and response models and tests.

There are various TODOs in here indicating some of the decisions that need to be made.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Add the parent directory to the import path; presumably this is where the `app`
# package imported below lives when the notebook is run from its own folder.
sys.path.append('..')

In [3]:
import os
import time
from typing import Optional, List, Dict, Tuple

import numpy as np

from app.index import OpenSearchIndex
from app.ml import SBERTEncoder

## 1. Setup

### 1a. Connect to Opensearch
As we're outside of docker-compose, we connect to Opensearch directly using the URL configured below (currently a hosted dev domain rather than localhost).

In [4]:
# Connection settings are read from the environment so that credentials are never
# stored in (or committed with) the notebook. Only the password is mandatory; the URL
# and username fall back to the dev values.
OPENSEARCH_URL = os.environ.get(
    "OPENSEARCH_URL",
    "https://search-opensearch-dev-temp-nzkttufrxxmadvl37l4teezwri.eu-west-2.es.amazonaws.com",
    # alternative: "https://search-navigator-alpha-g5fgeoght3wpmpk2jjxopbaaue.eu-west-2.es.amazonaws.com"
)
OPENSEARCH_USER = os.environ.get("OPENSEARCH_USER", "cpr-master")
OPENSEARCH_PASSWORD = os.environ.get("OPENSEARCH_PASSWORD")

if not OPENSEARCH_PASSWORD:
    # Fail early with an actionable message instead of a later authentication error.
    raise RuntimeError(
        "Set the OPENSEARCH_PASSWORD environment variable before running this notebook."
    )

opensearch = OpenSearchIndex(
    url=OPENSEARCH_URL,
    username=OPENSEARCH_USER,
    password=OPENSEARCH_PASSWORD,
    index_name="navigator",
    opensearch_connector_kwargs={
        "use_ssl": True,
        "verify_certs": True,
        "ssl_show_warn": True,
    },
    embedding_dim=768,
)

print(opensearch.is_connected())

# Handle used directly by the search cells further down.
opns = opensearch.opns

True


### 1b. Load sentence-BERT encoder

This is used to generate embeddings for semantic search.

In [5]:
# TODO: the encoder loaded here must be the same model that was used at indexing time.
# Once models start being updated we may want a mechanism that keeps the two in sync.
enc = SBERTEncoder(model_name="msmarco-distilbert-dot-v5")

# Sanity check: show the shape of a single encoded string.
hello_embedding = enc.encode("hello world")
hello_embedding.shape

(768,)

In [6]:
# Encode three related phrases and compare the first against the other two by dot product.
race_phrases = ("bicycle race", "car race", "tortoise race")
emba, embb, embc = (enc.encode(phrase) for phrase in race_phrases)

np.dot(emba, embb), np.dot(emba, embc)

(80.994026, 76.67018)

## 2. Run search

The `run_query` function does all of the heavy lifting here. See various TODOs and [issue #420](https://github.com/climatepolicyradar/navigator/issues/420) for discussion points.

In [7]:
def _innerproduct_threshold_to_lucene_threshold(ip_thresh: float) -> float:
    """
    Opensearch documentation on mapping similarity functions to Lucene thresholds is here: https://github.com/opensearch-project/k-NN/blob/main/src/main/java/org/opensearch/knn/index/SpaceType.java#L33
    It defines 'inner product' as negative inner product i.e. a distance rather than similarity measure, so we reverse the signs of inner product here compared to the docs.
    """
    if ip_thresh > 0:
        return ip_thresh + 1
    else:
        return 1 / (1-ip_thresh)

def _year_range_filter(year_range: Tuple[Optional[int], Optional[int]]):
    """
    Get an Opensearch filter for year range. The filter returned is between the first term of
    `year_range` and the last term, and is inclusive. Either value can be set to None to only
    apply one year constraint.
    """

    start_date = f"01/01/{year_range[0]}" if year_range[0] is not None else None
    end_date = f"31/12/{year_range[1]}" if year_range[1] is not None else None

    policy_year_conditions = {}
    if start_date is not None:
        policy_year_conditions["gte"] = start_date
    if end_date is not None:
        policy_year_conditions["lte"] = end_date

    range_filter = {"range": {}}

    range_filter["range"]["document_date"] = policy_year_conditions

    return range_filter


In [17]:
def _knn_min_score_clause(field: str, embedding, k: int, min_score: float) -> dict:
    """Build a kNN clause on `field`, wrapped in `function_score` so that `min_score` applies to it."""
    return {
        "function_score": {
            "query": {
                "knn": {
                    field: {
                        "vector": embedding,
                        "k": k,
                    },
                },
            },
            "min_score": min_score,
        }
    }


def run_query(
    query_string,
    max_passages_per_doc: int,
    keyword_filters: Optional[Dict[str, List[str]]] = None,
    year_range: Optional[Tuple[Optional[int], Optional[int]]] = None,
    name_boost=100,
    description_boost=40,
    text_boost=50,
    innerproduct_threshold=70,
    knn_k_value=10000,
    n_passages_to_sample_per_shard=5000,
    max_doc_count=100,
    required_fields: Optional[List[str]] = ["document_name"],  # only read, never mutated
):
    """
    Run a combined keyword + semantic search and return the raw Opensearch response.

    The query is a `bool` of three boosted groups (document name, document description and
    passage text). The description and text groups each pair a keyword `match` with a kNN
    lookup on the query embedding. No hits are returned directly (`size` is 0); results come
    back as aggregations that bucket a per-shard sample of matching passages by
    `document_name_and_id`.

    Relies on the notebook-level `enc` (query encoder) and `opns` (Opensearch client).

    Args:
        query_string: free-text query; used for the keyword matches and encoded for the kNN clauses.
        max_passages_per_doc: number of top passage hits returned per document bucket.
        keyword_filters: mapping of field name to accepted values; each entry becomes a `terms` filter.
        year_range: (first year, last year), inclusive. Either may be None to leave that side
            open; if both are None no date filter is added.
        name_boost: boost applied to the document-name group.
        description_boost: boost applied to the document-description group.
        text_boost: boost applied to the passage-text group.
        innerproduct_threshold: inner-product threshold, converted to a Lucene score and used
            as `min_score` on each kNN clause.
        knn_k_value: `k` passed to each kNN clause.
        n_passages_to_sample_per_shard: `shard_size` of the sampler aggregation.
        max_doc_count: maximum number of document buckets returned.
        required_fields: fields that must exist on a passage for it to match.

    Returns:
        The response returned by `opns.search`.
    """
    # TODO: we might want to handle encoding the query string outside of the search method?
    embedding = enc.encode(query_string)
    lucene_threshold = _innerproduct_threshold_to_lucene_threshold(innerproduct_threshold)

    name_group = {
        "bool": {
            "should": [
                {
                    "match": {
                        "for_search_document_name": {
                            "query": query_string,
                        }
                    }
                },
                {
                    "match_phrase": {
                        "for_search_document_name": {
                            "query": query_string,
                            "boost": 2,  # TODO: configure?
                        }
                    }
                },
            ],
            "boost": name_boost,
        }
    }

    description_group = {
        "bool": {
            "should": [
                {
                    "match": {
                        "for_search_document_description": {
                            "query": query_string,
                            "boost": 3,  # TODO: configure?
                        }
                    }
                },
                _knn_min_score_clause(
                    "document_description_embedding", embedding, knn_k_value, lucene_threshold
                ),
            ],
            "minimum_should_match": 1,
            "boost": description_boost,
        },
    }

    text_group = {
        "bool": {
            "should": [
                {
                    "match": {
                        "text": {
                            "query": query_string,
                        },
                    }
                },
                _knn_min_score_clause("text_embedding", embedding, knn_k_value, lucene_threshold),
            ],
            "minimum_should_match": 1,
            "boost": text_boost,
        }
    }

    bool_query = {
        "should": [name_group, description_group, text_group],
        "minimum_should_match": 1,
    }

    # Filters: one `terms` clause per keyword filter, plus an optional date range.
    filter_clauses = [
        {"terms": {field: values}} for field, values in (keyword_filters or {}).items()
    ]
    # A year range with no bounds at all carries no constraint, so don't send an empty range clause.
    if year_range and any(year is not None for year in year_range):
        filter_clauses.append(_year_range_filter(year_range))
    if filter_clauses:
        bool_query["filter"] = filter_clauses

    if required_fields:
        bool_query["must"] = [
            {"exists": {"field": field_name}} for field_name in required_fields
        ]

    opns_query = {
        "size": 0,  # only return aggregations
        "query": {"bool": bool_query},
        "aggs": {
            "sample": {
                "sampler": {"shard_size": n_passages_to_sample_per_shard},
                "aggs": {
                    "top_docs": {
                        "terms": {
                            "field": "document_name_and_id",
                            # Buckets are ordered by the `document_date` stats sub-aggregation below.
                            "order": {"document_date.avg": "asc"},
                            "size": max_doc_count,
                        },
                        "aggs": {
                            "top_passage_hits": {
                                "top_hits": {
                                    # Embeddings are excluded from the returned passage sources.
                                    "_source": {
                                        "excludes": [
                                            "text_embedding",
                                            "document_description_embedding",
                                        ]
                                    },
                                    "size": max_passages_per_doc,
                                },
                            },
                            "top_hit": {"max": {"script": {"source": "_score"}}},
                            "document_date": {
                                "stats": {
                                    "field": "document_date",
                                },
                            },
                        },
                    },
                },
            },
            "no_unique_docs": {"cardinality": {"field": "document_name_and_id"}},
        },
    }

    # perf_counter is monotonic, so the elapsed time is not affected by system clock adjustments.
    start = time.perf_counter()
    response = opns.search(
        body=opns_query,
        index="navigator",
        request_timeout=30,
        preference="prototype_user",  # TODO: document what this means
        explain=True,
    )
    end = time.perf_counter()
    print(f"query execution time: {round(end-start, 2)}s")

    return response

# Example: search for "Energy prices" in documents dated 2017-2022 (inclusive).
run_query(
    "Energy prices",
    max_passages_per_doc=10,
    year_range=(2017, 2022),  # tuple of ints, matching the declared type of `year_range`
)

query execution time: 1.79s


{'took': 7,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': None,
  'hits': []},
 'aggregations': {'no_unique_docs': {'value': 519},
  'sample': {'doc_count': 5000,
   'top_docs': {'doc_count_error_upper_bound': -1,
    'sum_other_doc_count': 3934,
    'buckets': [{'key': '5-year and 20-year national development plan 12936',
      'doc_count': 16,
      'document_date': {'count': 16,
       'min': 1483228800000.0,
       'max': 1483228800000.0,
       'avg': 1483228800000.0,
       'sum': 23731660800000.0,
       'min_as_string': '01/01/2017',
       'max_as_string': '01/01/2017',
       'avg_as_string': '01/01/2017',
       'sum_as_string': '11/01/2722'},
      'top_hit': {'value': 377.43115234375},
      'top_passage_hits': {'hits': {'total': {'value': 16, 'relation': 'eq'},
        'max_score': 377.43115,
        'hits': [{'_index': 'navigator',
          '_type': '_doc'