# OpenSearch query example

This notebook is a fully working OpenSearch query example that we can use for discussion and development before adding the constraints of request and response models and tests.

There are various TODOs in here indicating some of the decisions that need to be made.

In [5]:
%load_ext autoreload
%autoreload 2

import os
import time
from typing import Optional, List, Dict, Tuple

import numpy as np

from app.index import OpenSearchIndex
from app.ml import SBERTEncoder

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Setup

### 1a. Connect to Opensearch
As we're running outside of docker-compose, we'll connect to OpenSearch via localhost.

In [7]:
# Connection settings are read from the environment so that credentials do not have to be
# committed with the notebook. The fallbacks are the values that were previously hard-coded here,
# so running the cell without any of the variables set behaves exactly as before.
# NOTE(review): the environment variable names are a proposal - align them with the deployment config.
opensearch = OpenSearchIndex(
    url=os.environ.get("OPENSEARCH_URL", "http://localhost:9200"),
    username=os.environ.get("OPENSEARCH_USER", "admin"),
    password=os.environ.get("OPENSEARCH_PASSWORD", "admin"),
    # Kept as a literal: `run_query` searches the "navigator" index by name.
    index_name="navigator",
    # TODO: convert to env variables?
    opensearch_connector_kwargs={
        "use_ssl": False,
        "verify_certs": False,
        "ssl_show_warn": False,
    },
    embedding_dim=768,
)

print(opensearch.is_connected())

# Handle used by the rest of the notebook to issue searches.
opns = opensearch.opns

True


### 1b. Load sentence-BERT encoder

This is used to generate embeddings for semantic search.

In [8]:
# TODO: the encoder must be the same model that was used for indexing. At a later stage, when we
# start updating models, we may want a way of ensuring both models are the same.
SBERT_MODEL_NAME = "msmarco-distilbert-dot-v5"

enc = SBERTEncoder(model_name=SBERT_MODEL_NAME)

# Sanity check: show the shape of a single encoded string.
enc.encode("hello world").shape

(768,)

In [9]:
# Encode three short queries, then compare the first against each of the other two by dot product.
emba, embb, embc = (
    enc.encode(text) for text in ("bicycle race", "car race", "tortoise race")
)

np.dot(emba, embb), np.dot(emba, embc)

(80.994026, 76.67018)

## 2. Run search

The `run_query` function does all of the heavy lifting here. See various TODOs and [issue #420](https://github.com/climatepolicyradar/navigator/issues/420) for discussion points.

In [23]:
def _innerproduct_threshold_to_lucene_threshold(ip_thresh: float) -> float:
    """
    Opensearch documentation on mapping similarity functions to Lucene thresholds is here: https://github.com/opensearch-project/k-NN/blob/main/src/main/java/org/opensearch/knn/index/SpaceType.java#L33
    It defines 'inner product' as negative inner product i.e. a distance rather than similarity measure, so we reverse the signs of inner product here compared to the docs.
    """
    if ip_thresh > 0:
        return ip_thresh + 1
    else:
        return 1 / (1-ip_thresh)
    
def _year_range_filter(year_range: Tuple[Optional[int], Optional[int]]):
    """
    Get an Opensearch filter for year range. The filter returned is between the first term of
    `year_range` and the last term, and is inclusive. Either value can be set to None to only
    apply one year constraint.
    """

    start_date = f"01/01/{year_range[0]}" if year_range[0] is not None else None
    end_date = f"31/12/{year_range[1]}" if year_range[1] is not None else None

    policy_year_conditions = {}
    if start_date is not None:
        policy_year_conditions["gte"] = start_date
    if end_date is not None:
        policy_year_conditions["lte"] = end_date

    range_filter = {"range": {}}

    range_filter["range"]["action_date"] = policy_year_conditions

    return range_filter
    

def run_query(
    query: str,
    innerproduct_threshold: float,
    max_passages_per_doc: int,
    keyword_filters: Optional[Dict[str, List[str]]] = None,
    year_range: Optional[Tuple[Optional[int], Optional[int]]] = None,
) -> dict:
    """
    Search Opensearch for `query` and return the top passages grouped by document.

    The query string is matched against the passage text (keyword match, phrase match and KNN on
    its embedding), against the action name and against the action description. At least one of
    these clauses has to match. No individual hits are requested; the results come back as a
    `top_docs` aggregation keyed on `document_id`, ordered by each document's best passage score.

    Args:
        query (str): query string.
        innerproduct_threshold (float): inner product threshold applied to the KNN results.
        max_passages_per_doc (int): maximum number of passages to return per document.
        keyword_filters (Optional[Dict[str, List[str]]]): filters on keyword values to apply,
            in the format `{"field_name": ["values", ...], ...}`. Defaults to None.
        year_range (Optional[Tuple[Optional[int], Optional[int]]]): filter on action year by
            (minimum, maximum). Either value can be `None` for a one-sided filter.

    Returns:
        dict: raw Opensearch result.
    """
    # TODO: we might want to handle encoding the query string outside of the search method?
    query_embedding = enc.encode(query)
    knn_min_score = _innerproduct_threshold_to_lucene_threshold(innerproduct_threshold)

    def _field_clause(clause_type: str, field: str, boost: int) -> dict:
        # All text clauses share one shape; only the clause type, field and boost differ.
        return {clause_type: {field: {"query": query, "boost": boost}}}

    # Text passage semantic search (KNN)
    # TODO: setting the KNN threshold too high essentially filters out the KNN results but leaves
    # the others in. This should be documented.
    knn_clause = {
        "function_score": {
            "query": {
                "knn": {
                    "text_embedding": {
                        "vector": query_embedding,
                        # TODO: this k value should match above
                        "k": 10,
                    },
                },
            },
            "min_score": knn_min_score,
        }
    }

    should_clauses = [
        # Text passage matching
        _field_clause("match", "text", 1),
        _field_clause("match_phrase", "text", 1),
        knn_clause,
        # Action (to be document) title matching
        _field_clause("match_phrase", "name", 10),
        # {
        #     "prefix": {
        #         "name": {
        #             "value": query,
        #             "boost": 10,
        #             "case_insensitive": True,
        #         },
        #     }
        # },
        # Action (to be document) description matching
        _field_clause("match", "description", 1),
    ]

    bool_query = {
        "should": should_clauses,
        "minimum_should_match": 1,
    }

    # Optional filters: one `terms` clause per keyword field, followed by the year range.
    filter_clauses = []
    if keyword_filters:
        filter_clauses.extend(
            {"terms": {field: values}} for field, values in keyword_filters.items()
        )
    if year_range:
        filter_clauses.append(_year_range_filter(year_range))
    if keyword_filters or year_range:
        bool_query["filter"] = filter_clauses

    # Bucket passages by document, ranking buckets by the score of their best passage.
    top_docs_aggregation = {
        "terms": {
            "field": "document_id",
            "order": {"top_hit": "desc"},
        },
        "aggs": {
            "top_passage_hits": {
                "top_hits": {
                    "_source": {"excludes": ["text_embedding"]},
                    "size": max_passages_per_doc,
                }
            },
            "top_hit": {"max": {"script": {"source": "_score"}}},
        },
    }

    opns_query = {
        "size": 0,  # only return aggregations
        "query": {"bool": bool_query},
        "aggs": {"top_docs": top_docs_aggregation},
    }

    started = time.time()
    response = opns.search(
        body=opns_query,
        index="navigator",
        request_timeout=30,
        preference="prototype_user",  # TODO: document what this means
        explain=True,
    )
    finished = time.time()
    print(f"query execution time: {round(finished - started, 2)}s")

    return response

# TODO: the inner product threshold used here should be adjusted experimentally
run_query(
    query="National Home Retrofit Scheme 2020 (One Stop Shop Development Call)",
    innerproduct_threshold=70,  # Same as prototype
    max_passages_per_doc=10,
    keyword_filters={"country_code": ["KEN"]},
    year_range=(2000, None),
)

query execution time: 0.05s


{'took': 38,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 159, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'top_docs': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 0,
   'buckets': [{'key': '1315',
     'doc_count': 74,
     'top_hit': {'value': 8.16714859008789},
     'top_passage_hits': {'hits': {'total': {'value': 74, 'relation': 'eq'},
       'max_score': 8.167149,
       'hits': [{'_index': 'navigator',
         '_type': '_doc',
         '_id': 'X7RI1X8B6OZKDs8-okpl',
         '_score': 8.167149,
         '_source': {'document_name': 'Full text (pdf)',
          'country_code': 'KEN',
          'action_id': 1032,
          'action_source_name': 'CCLW',
          'text_block_id': 'p12_b326',
          'action_date': '09/07/2016',
          'text': 'one shall be nominated by a national body \nrepresenting community forest associations;',
          'document_id': 13