# Opensearch query example

This notebook serves as a fully working Opensearch query example that we can use for discussion and development before adding the constraints of request and response models and tests.

There are various TODOs in here indicating some of the decisions that need to be made.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from pathlib import Path

# Root of the local `navigator` checkout. Set the NAVIGATOR_ROOT environment variable to
# point at your own clone; the default keeps the path this notebook was originally run with.
NAVIGATOR_ROOT = Path(
    os.environ.get("NAVIGATOR_ROOT", "/home/stefan/PycharmProjects/navigator")
)

# Append the project source directories to the import path
# (presumably required by the project imports in the following cell).
for _subdir in ("search-index", "common", "backend/app/core"):
    sys.path.append(str(NAVIGATOR_ROOT / _subdir))

In [3]:

import os
import time
from typing import Optional, List, Dict, Tuple

import numpy as np

from app.index import OpenSearchIndex
from app.ml import SBERTEncoder

## 1. Setup

### 1a. Connect to Opensearch
As we're outside of docker-compose, we connect directly to the Opensearch endpoint given in the cell below rather than to a docker-compose service.

In [4]:
# Connection details are read from the environment so that no credentials are stored in
# the notebook. Required variables: OPENSEARCH_URL, OPENSEARCH_USER, OPENSEARCH_PASSWORD.
# A missing variable raises KeyError here, before any connection attempt is made.
# NOTE(review): a password was previously committed in this cell and should be rotated.
opensearch = OpenSearchIndex(
    url=os.environ["OPENSEARCH_URL"],
    username=os.environ["OPENSEARCH_USER"],
    password=os.environ["OPENSEARCH_PASSWORD"],
    index_name="navigator",
    # NOTE(review): SSL and certificate verification are switched off here; confirm this
    # is intended when the endpoint URL is https.
    opensearch_connector_kwargs={
        "use_ssl": False,
        "verify_certs": False,
        "ssl_show_warn": False,
    },
    # Must match the dimension of the query embeddings (the encoder below returns 768 values).
    embedding_dim=768,
)

# Quick check that the cluster is reachable.
print(opensearch.is_connected())

# Client object used by the query function further down.
opns = opensearch.opns

True


### 1b. Load sentence-BERT encoder

This is used to generate embeddings for semantic search.

In [5]:
# TODO: this needs to be the same model as used for indexing. At a later stage when we start updating
# models we may want a way of ensuring both models are the same.
enc = SBERTEncoder(model_name="msmarco-distilbert-dot-v5")
# Sanity check: display the shape of a single query embedding. It should agree with the
# `embedding_dim` passed to `OpenSearchIndex` when connecting.
enc.encode("hello world").shape

Ignored unknown kwarg option direction


(768,)

In [6]:
# Encode three short phrases and compare the first against the other two using the
# dot product, the same similarity measure that the KNN threshold is expressed in.
emba, embb, embc = (
    enc.encode(phrase) for phrase in ("bicycle race", "car race", "tortoise race")
)

np.dot(emba, embb), np.dot(emba, embc)

Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction


(80.99403, 76.67019)

## 2. Run search

The `run_query` function does all of the heavy lifting here. See various TODOs and [issue #420](https://github.com/climatepolicyradar/navigator/issues/420) for discussion points.

In [7]:
def _innerproduct_threshold_to_lucene_threshold(ip_thresh: float) -> float:
    """
    Opensearch documentation on mapping similarity functions to Lucene thresholds is here: https://github.com/opensearch-project/k-NN/blob/main/src/main/java/org/opensearch/knn/index/SpaceType.java#L33
    It defines 'inner product' as negative inner product i.e. a distance rather than similarity measure, so we reverse the signs of inner product here compared to the docs.
    """
    if ip_thresh > 0:
        return ip_thresh + 1
    else:
        return 1 / (1-ip_thresh)

def _year_range_filter(year_range: Tuple[Optional[int], Optional[int]]):
    """
    Get an Opensearch filter for year range. The filter returned is between the first term of
    `year_range` and the last term, and is inclusive. Either value can be set to None to only
    apply one year constraint.
    """

    start_date = f"01/01/{year_range[0]}" if year_range[0] is not None else None
    end_date = f"31/12/{year_range[1]}" if year_range[1] is not None else None

    policy_year_conditions = {}
    if start_date is not None:
        policy_year_conditions["gte"] = start_date
    if end_date is not None:
        policy_year_conditions["lte"] = end_date

    range_filter = {"range": {}}

    range_filter["range"]["document_date"] = policy_year_conditions

    return range_filter


# def run_query(
#     query: str,
#     innerproduct_threshold: float,
#     max_passages_per_doc: int,
#     keyword_filters: Optional[Dict[str, List[str]]] = None,
#     year_range: Optional[Tuple[Optional[int], Optional[int]]] = None
# ) -> dict:
#     """
#     Run an Opensearch query.

#     Args:
#         query (str): query string
#         innerproduct_threshold (float): threshold applied to KNN results
#         max_passages_per_doc (int): maximum number of passages to return per document
#         keyword_filters (Optional[Dict[str, List[str]]]): filters on keyword values to apply.
#         In the format `{"field_name": ["values", ...], ...}`. Defaults to None.
#         year_range (Optional[Tuple[Optional[int], Optional[int]]]): filter on action year by (minimum, maximum).
#         Either value can be set to `None` for a one-sided filter.

#     Returns:
#         dict: raw Opensearch result.
#     """
#     # TODO: we might want to handle encoding the query string outside of the search method?
#     embedding = enc.encode(query)
#     lucene_threshold = _innerproduct_threshold_to_lucene_threshold(innerproduct_threshold)

#     opns_query = {
#                 "size": 0, # only return aggregations
#                 "query": {
#                     "bool": {
#                         "should": [
#                             # Text passage matching
#                             {
#                                 "match": {
#                                     "text": {
#                                         "query": query,
#                                         "boost": 1,
#                                     },
#                                 }
#                             },
#                             {
#                                 "match_phrase": {
#                                     "text": {
#                                         "query": query,
#                                         "boost": 1,
#                                     },
#                                 }
#                             },
#                             # Text passage semantic search (KNN)
#                             {
#                                 # TODO: setting the KNN threshold too high essentially filters out the KNN results but leaves the others in.
#                                 # This should be documented.
#                                 "function_score": {
#                                     "query": {
#                                         "knn": {
#                                             "text_embedding": {
#                                                 "vector": embedding,
#                                                 # TODO: this k value should match above
#                                                 "k": 10000,
#                                             },
#                                         },
#                                     },
#                                     "min_score": lucene_threshold
#                                 }
#                             },
#                             # Action (to be document) title matching
#                             {
#                                 "match_phrase": {
#                                     "document_name": {
#                                         "query": query,
#                                         "boost": 100,
#                                     },
#                                 }
#                             },
#                             # {
#                             #     "prefix": {
#                             #         "name": {
#                             #             "value": query,
#                             #             "boost": 10,
#                             #             "case_insensitive": True,
#                             #         },
#                             #     }
#                             # },
#                             # Action (to be document) description matching
#                             {
#                                 "match": {
#                                     "document_description": {
#                                         "query": query,
#                                         "boost": 10,
#                                     },
#                                 }
#                             },
#                         ],
#                         "minimum_should_match": 1,
#                     },
#                 },
#                 "aggs": {
#                     "top_docs": {
#                         "terms": {
#                             "field": "document_id",
#                             "order": {"top_hit": "desc"},
#                         },
#                         "aggs": {
#                             "top_passage_hits": {
#                                 "top_hits": {
#                                     "_source": {"excludes": ["text_embedding"]},
#                                     "size": max_passages_per_doc,
#                                 }
#                             },
#                             "top_hit": {"max": {"script": {"source": "_score"}}},
#                         },
#                     }
#                 },
#             }

#     if keyword_filters:
#         terms_clauses = []

#         for field, values in keyword_filters.items():
#             terms_clauses.append({"terms": {field: values}})

#         opns_query["query"]["bool"]["filter"] = terms_clauses


#     if year_range:
#         if "filter" not in opns_query["query"]["bool"]:
#             opns_query["query"]["bool"]["filter"] = []

#         opns_query["query"]["bool"]["filter"].append(
#             _year_range_filter(year_range)
#         )


#     start = time.time()
#     response = opns.search(
#         body=opns_query,
#         index="navigator",
#         request_timeout=30,
#         preference="prototype_user", # TODO: document what this means
#         explain=True,
#     )
#     end = time.time()
#     print(f"query execution time: {round(end-start, 2)}s")

#     return response

# def run_query_updated(query_string,
#                          max_passages_per_doc: int,
#     keyword_filters: Optional[Dict[str, List[str]]] = None,
#     year_range: Optional[Tuple[Optional[int], Optional[int]]] = None,
#                      name_boost=100, description_boost=10, innerproduct_threshold=70, knn_k_value=10000):
#         # TODO: we might want to handle encoding the query string outside of the search method?
#     embedding = enc.encode(query_string)
#     lucene_threshold = _innerproduct_threshold_to_lucene_threshold(innerproduct_threshold)
#     opns_query = {
#                 "size": 0, # only return aggregations
#                 "query": {
#                     "bool": {
#                         "should":
            
#     [
#                 {
#                     "bool": {
#                         "should": [
#                             {
#                                 "match": {
#                                     "for_search_document_name": {
#                                         "query": query_string,
#                                     }
#                                 }
#                             },
#                             {
#                                 "match_phrase": {
#                                     "for_search_document_name": {
#                                         "query": query_string,
#                                         "boost": 2,  # TODO: configure?
#                                     }
#                                 }
#                             },
#                         ],
#                         "boost": name_boost,
#                     }
#                 },
#                 {
#                     "bool": {
#                         "should": [
#                             {
#                                 "match": {
#                                     "for_search_document_description": {
#                                         "query": query_string,
#                                         "boost": 3,  # TODO: configure?
#                                     }
#                                 }
#                             },
#                             {
#                                 "function_score": {
#                                     "query": {
#                                         "knn": {
#                                             "document_description_embedding": {
#                                                 "vector": embedding,
#                                                 "k": knn_k_value,
#                                             },
#                                         },
#                                     },
#                                     "min_score": lucene_threshold,
#                                 }
#                             },
#                         ],
#                         "minimum_should_match": 1,
#                         "boost": description_boost,
#                     },
#                 },
#                 {
#                     "bool": {
#                         "should": [
#                             {
#                                 "match": {
#                                     "text": {
#                                         "query": query_string,
#                                     },
#                                 }
#                             },
#                             {
#                                 "function_score": {
#                                     "query": {
#                                         "knn": {
#                                             "text_embedding": {
#                                                 "vector": embedding,
#                                                 "k": knn_k_value,
#                                             },
#                                         },
#                                     },
#                                     "min_score": lucene_threshold,
#                                 }
#                             },
#                         ],
#                         "minimum_should_match": 1,
#                     }
#                 },
#             ],
#                         "minimum_should_match": 1,
#                     },
#                 },
#                 "aggs": {
#                     "top_docs": {
#                         "terms": {
#                             "field": "document_id",
#                             "order": {"top_hit": "desc"},
#                         },
#                         "aggs": {
#                             "top_passage_hits": {
#                                 "top_hits": {
#                                     "_source": {"excludes": ["text_embedding"]},
#                                     "size": max_passages_per_doc,
#                                 }
#                             },
#                             "top_hit": {"max": {"script": {"source": "_score"}}},
#                         },
#                     }
#                 },
#             }
#     if keyword_filters:
#         terms_clauses = []

#         for field, values in keyword_filters.items():
#             terms_clauses.append({"terms": {field: values}})

#         opns_query["query"]["bool"]["filter"] = terms_clauses


#     if year_range:
#         if "filter" not in opns_query["query"]["bool"]:
#             opns_query["query"]["bool"]["filter"] = []

#         opns_query["query"]["bool"]["filter"].append(
#             _year_range_filter(year_range)
#         )


#     start = time.time()
#     response = opns.search(
#         body=opns_query,
#         index="navigator",
#         request_timeout=30,
#         preference="prototype_user", # TODO: document what this means
#         explain=True,
#     )
#     end = time.time()
#     print(f"query execution time: {round(end-start, 2)}s")

#     return response

# # # TODO: we should experimentally adjust this threshold
# # run_query(
# #     "cycle to work",
# #     innerproduct_threshold=70, # Same as prototype
# #     max_passages_per_doc=10,
# # #     year_range=(2000, None),
# # #     keyword_filters={
# # #         "country_code": ["KEN"]
# # #     }
# # )

# run_query_updated("blue hydrogen", max_passages_per_doc=1)

In [13]:
def run_query_updated(
    query_string,
    max_passages_per_doc: int,
    keyword_filters: Optional[Dict[str, List[str]]] = None,
    year_range: Optional[Tuple[Optional[int], Optional[int]]] = None,
    name_boost=100,
    description_boost=10,
    innerproduct_threshold=70,
    knn_k_value=10000,
    n_passages_to_sample_per_shard=5000,
    max_doc_count=100,
):
    """
    Run a combined keyword + KNN search against the `navigator` index and return the raw response.

    The query matches `query_string` against the document name, the document description
    and the passage text. Description and passage matching each also include a KNN clause
    on the corresponding embedding field. No individual hits are requested (`size` is 0);
    results come back through aggregations that bucket a sample of matches by
    `document_name_and_id`, ordered by each bucket's best score.

    Args:
        query_string: text to search for; also encoded with the module-level `enc` to
            obtain the vector used by the KNN clauses.
        max_passages_per_doc (int): number of top passages returned per document bucket.
        keyword_filters (Optional[Dict[str, List[str]]]): `{"field_name": ["values", ...], ...}`;
            each entry becomes a `terms` filter. Defaults to None (no keyword filter).
        year_range (Optional[Tuple[Optional[int], Optional[int]]]): (minimum, maximum) years,
            turned into a filter by `_year_range_filter`. Defaults to None (no date filter).
        name_boost: boost applied to the document-name clause.
        description_boost: boost applied to the document-description clause.
        innerproduct_threshold: inner-product threshold for KNN results; converted with
            `_innerproduct_threshold_to_lucene_threshold` and used as `min_score`.
        knn_k_value: `k` used by both KNN clauses.
        n_passages_to_sample_per_shard: `shard_size` of the sampler aggregation.
        max_doc_count: maximum number of document buckets returned.

    Returns:
        Whatever `opns.search` returns for the constructed query.

    Side effects:
        Prints the wall-clock time taken by the search call.
    """
    # TODO: we might want to handle encoding the query string outside of the search method?
    query_vector = enc.encode(query_string)
    min_knn_score = _innerproduct_threshold_to_lucene_threshold(innerproduct_threshold)

    def semantic_clause(embedding_field):
        # KNN lookup on one embedding field, dropping neighbours that score below the threshold.
        return {
            "function_score": {
                "query": {
                    "knn": {
                        embedding_field: {
                            "vector": query_vector,
                            "k": knn_k_value,
                        },
                    },
                },
                "min_score": min_knn_score,
            }
        }

    # Document name: plain match plus a phrase match that counts double.
    name_clause = {
        "bool": {
            "should": [
                {"match": {"for_search_document_name": {"query": query_string}}},
                {
                    "match_phrase": {
                        "for_search_document_name": {
                            "query": query_string,
                            "boost": 2,  # TODO: configure?
                        }
                    }
                },
            ],
            "boost": name_boost,
        }
    }

    # Document description: keyword match or semantic (KNN) match.
    description_clause = {
        "bool": {
            "should": [
                {
                    "match": {
                        "for_search_document_description": {
                            "query": query_string,
                            "boost": 3,  # TODO: configure?
                        }
                    }
                },
                semantic_clause("document_description_embedding"),
            ],
            "minimum_should_match": 1,
            "boost": description_boost,
        },
    }

    # Text passages: keyword match or semantic (KNN) match.
    passage_clause = {
        "bool": {
            "should": [
                {"match": {"text": {"query": query_string}}},
                semantic_clause("text_embedding"),
            ],
            "minimum_should_match": 1,
        }
    }

    # Computed for every document bucket: its best passages (without the embedding
    # vectors), its best score (used to order the buckets) and statistics on its date.
    per_document_aggs = {
        "top_passage_hits": {
            "top_hits": {
                "_source": {
                    "excludes": [
                        "text_embedding",
                        "document_description_embedding",
                    ]
                },
                "size": max_passages_per_doc,
            },
        },
        "top_hit": {"max": {"script": {"source": "_score"}}},
        "document_date": {"stats": {"field": "document_date"}},
    }

    aggregations = {
        "sample": {
            "sampler": {"shard_size": n_passages_to_sample_per_shard},
            "aggs": {
                "top_docs": {
                    "terms": {
                        "field": "document_name_and_id",
                        "order": {"top_hit": "desc"},
                        "size": max_doc_count,
                    },
                    "aggs": per_document_aggs,
                },
            },
        },
        # Declared outside the sampler aggregation.
        "no_unique_docs": {"cardinality": {"field": "document_name_and_id"}},
    }

    bool_query = {
        "should": [name_clause, description_clause, passage_clause],
        "minimum_should_match": 1,
    }
    search_body = {
        "size": 0,  # only return aggregations
        "query": {"bool": bool_query},
        "aggs": aggregations,
    }

    # Optional filters are attached to the outer bool query.
    if keyword_filters:
        bool_query["filter"] = [
            {"terms": {field: values}} for field, values in keyword_filters.items()
        ]

    if year_range:
        bool_query.setdefault("filter", []).append(_year_range_filter(year_range))

    started_at = time.time()
    response = opns.search(
        body=search_body,
        index="navigator",
        request_timeout=30,
        preference="prototype_user",  # TODO: document what this means
        explain=True,
    )
    elapsed = time.time() - started_at
    print(f"query execution time: {round(elapsed, 2)}s")

    return response

# Example: run a query with the default boosts and thresholds, keeping one passage per
# document. As the last expression in the cell, the raw response is shown as the output.
run_query_updated("green hydrogen", max_passages_per_doc=1)

Ignored unknown kwarg option direction
query execution time: 0.4s


{'took': 296,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': None,
  'hits': []},
 'aggregations': {'no_unique_docs': {'value': 540},
  'sample': {'doc_count': 5000,
   'top_docs': {'doc_count_error_upper_bound': -1,
    'sum_other_doc_count': 1268,
    'buckets': [{'key': 'basic hydrogen strategy 308',
      'doc_count': 136,
      'document_date': {'count': 136,
       'min': 1483228800000.0,
       'max': 1483228800000.0,
       'avg': 1483228800000.0,
       'sum': 201719116800000.0,
       'min_as_string': '01/01/2017',
       'max_as_string': '01/01/2017',
       'avg_as_string': '01/01/2017',
       'sum_as_string': '24/03/8362'},
      'top_hit': {'value': 643.255615234375},
      'top_passage_hits': {'hits': {'total': {'value': 136, 'relation': 'eq'},
        'max_score': 643.2556,
        'hits': [{'_index': 'navigator',
          '_type': '_doc',
          '_id'