In [2]:
import json
from opensearchpy import OpenSearch
from pprint import pprint

# Connection settings for a local, single-node OpenSearch instance.
host = 'localhost'
port = 9200
base_url = "https://{}:{}/".format(host, port)
auth = ('admin', 'admin')  # For testing only. Don't store credentials in code.
index_name = "bbuy_products"

# Build the client used by every cell below.
# NOTE: certificate and hostname verification are switched off and the
# related warnings are silenced -- acceptable for a local sandbox only.
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=auth,
    # client_cert = client_cert_path,
    # client_key = client_key_path,
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

In [3]:
# Pretty-print the field mappings of the 'searchml_ltr' index.
pprint(client.indices.get_mapping(index='searchml_ltr'))

{'searchml_ltr': {'mappings': {'properties': {'body': {'analyzer': 'english',
                                                       'type': 'text'},
                                              'category': {'ignore_above': 256,
                                                           'type': 'keyword'},
                                              'id': {'fields': {'keyword': {'ignore_above': 256,
                                                                            'type': 'keyword'}},
                                                     'type': 'text'},
                                              'in_stock': {'type': 'boolean'},
                                              'price': {'type': 'float'},
                                              'title': {'analyzer': 'english',
                                                        'type': 'text'}}}}}


In [16]:
# Create an index with no explicit mappings; the only setting supplied is
# the default field used by queries that do not name a field.
index_name = 'search_fun_revisited'
index_body = {
    'settings': {
        'index': {
            'query': {
                'default_field': "body"
            }
        }
    }
}

# Drop any copy of the index left over from a previous run so this cell can
# be re-executed; ignore_unavailable keeps the delete from failing when the
# index does not exist yet.
client.indices.delete(index=index_name, ignore_unavailable=True)

response = client.indices.create(index=index_name, body=index_body)
print('\nCreating index:')
print(response)


Creating index:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'search_fun_revisited'}


In [10]:
# Create a second index with the same default query field, but with explicit
# field mappings instead of letting OpenSearch infer them.
index_name = 'search_fun_revisited_custom_mappings'
index_body = {
    'settings': {
        'index': {
            'query': {
                'default_field': "body"
            }
        }
    },
    "mappings": {
        "properties": {
            "title": {"type": "text", "analyzer": "english"},
            "body": {"type": "text", "analyzer": "english"},
            "in_stock": {"type": "boolean"},
            # ignore_above is a numeric limit, so pass an integer rather
            # than the string "256".
            "category": {"type": "keyword", "ignore_above": 256},
            "price": {"type": "float"}
        }
    }
}

# Delete first so the cell is re-runnable; ignore_unavailable keeps the
# delete from failing when the index does not exist yet.
client.indices.delete(index=index_name, ignore_unavailable=True)
client.indices.create(index=index_name, body=index_body)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'search_fun_revisited_custom_mappings'}

In [17]:


# Sample documents indexed by the cells below. Prices are deliberately kept
# as strings, and the key order of each document is preserved as authored.
docs = [
    {
        "id": "doc_a",
        "title": "Fox and Hounds",
        "body": "The quick red fox jumped over the lazy brown dogs.",
        "price": "5.99",
        "in_stock": True,
        "category": "childrens",
    },
    {
        "id": "doc_b",
        "title": "Fox wins championship",
        "body": "Wearing all red, the Fox jumped out to a lead in the race over the Dog.",
        "price": "15.13",
        "in_stock": True,
        "category": "sports",
    },
    {
        "id": "doc_c",
        "title": "Lead Paint Removal",
        "body": "All lead must be removed from the brown and red paint.",
        "price": "150.21",
        "in_stock": False,
        "category": "instructional",
    },
    {
        "id": "doc_d",
        "title": "The Three Little Pigs Revisited",
        "price": "3.51",
        "in_stock": True,
        "body": "The big, bad wolf huffed and puffed and blew the house down. The end.",
        "category": "childrens",
    },
    {
        "id": "doc_e",
        "title": "Pigs in a Blanket and Other Recipes",
        "price": "27.50",
        "in_stock": True,
        "body": "Pigs in a blanket aren't as cute as you would think given it's a food and not actual pigs wrapped in blankets.",
        "category": "instructional",
    },
    {
        "id": "doc_f",
        "title": "Dogs are the best",
        "body": "Dogs beat cats every day of the week and twice on Sunday. A dog is always up for doing something.  Since there are so many dog breeds, there is a dog for everyone!",
        "price": "50.99",
        "in_stock": True,
        "category": "childrens",
    },
    {
        "id": "doc_g",
        "title": "Dog",
        "body": "Dogs rule",
        "price": "5.99",
        "in_stock": True,
        "category": "childrens",
    },
    {
        "id": "doc_h",
        "title": "Dog: The bounty hunter: living in the red",
        "body": "Dog is a bounty hunter who goes on pretend missions with his friends, one of whom is the Fox",
        "price": "125.99",
        "in_stock": True,
        "category": "sports",
    },
]
# Index every sample document into both indexes -- first the one with explicit
# mappings, then the one created without mappings -- so the same queries can
# be compared across the two. One nested loop replaces two copy-pasted loops
# that differed only in the target index name.
for target_index in ('search_fun_revisited_custom_mappings', 'search_fun_revisited'):
    for doc in docs:
        doc_id = doc["id"]
        print("Indexing {}".format(doc_id))
        response = client.index(
            index=target_index,
            body=doc,
            id=doc_id,  # reuse the document's own id so re-runs update in place
            refresh=True
        )
        print('\n\tResponse:')
        print(response)

Indexing doc_a

	Response:
{'_index': 'search_fun_revisited_custom_mappings', '_type': '_doc', '_id': 'doc_a', '_version': 2, 'result': 'updated', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 8, '_primary_term': 1}
Indexing doc_b

	Response:
{'_index': 'search_fun_revisited_custom_mappings', '_type': '_doc', '_id': 'doc_b', '_version': 2, 'result': 'updated', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 9, '_primary_term': 1}
Indexing doc_c

	Response:
{'_index': 'search_fun_revisited_custom_mappings', '_type': '_doc', '_id': 'doc_c', '_version': 2, 'result': 'updated', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 10, '_primary_term': 1}
Indexing doc_d

	Response:
{'_index': 'search_fun_revisited_custom_mappings', '_type': '_doc', '_id': 'doc_d', '_version': 2, 'result': 'updated', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed

In [20]:
# Querying: a multi_match query with no "fields" list, run against the index
# that was created without explicit mappings.
q = 'dogs'

query = {
    "size": 5,
    "query": {
        "multi_match": {"query": q},
    },
}

pprint(client.search(index='search_fun_revisited', body=query))



{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': 'doc_g',
                    '_index': 'search_fun_revisited',
                    '_score': 1.4700978,
                    '_source': {'body': 'Dogs rule',
                                'category': 'childrens',
                                'id': 'doc_g',
                                'in_stock': True,
                                'price': '5.99',
                                'title': 'Dog'},
                    '_type': '_doc'},
                   {'_id': 'doc_a',
                    '_index': 'search_fun_revisited',
                    '_score': 1.1129589,
                    '_source': {'body': 'The quick red fox jumped over the '
                                        'lazy brown dogs.',
                                'category': 'childrens',
                                'id': 'doc_a',
                                'in_stock': True,
                                'p

In [21]:
# Run the same query against the index with explicit mappings for comparison.
pprint(client.search(index='search_fun_revisited_custom_mappings', body=query))

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': 'doc_g',
                    '_index': 'search_fun_revisited_custom_mappings',
                    '_score': 0.7342377,
                    '_source': {'body': 'Dogs rule',
                                'category': 'childrens',
                                'id': 'doc_g',
                                'in_stock': True,
                                'price': '5.99',
                                'title': 'Dog'},
                    '_type': '_doc'},
                   {'_id': 'doc_f',
                    '_index': 'search_fun_revisited_custom_mappings',
                    '_score': 0.7156082,
                    '_source': {'body': 'Dogs beat cats every day of the week '
                                        'and twice on Sunday. A dog is always '
                                        'up for doing something.  Since there '
                                        'are so many 

In [22]:
q = 'dogs'
index_name = 'search_fun_revisited_custom_mappings'

# The field "title" is twice as important for matching in this query compared
# to "body", thanks to the "^2" factor in the "fields" attribute.
query = {
  'size': 5,
  'query': {
    'multi_match': {
      'query': q,
      'fields': ['title^2', 'body']
    }
  }
}

# Fixed: the original passed the undefined name `index_names`, which raises
# NameError; the variable assigned above is `index_name`.
client.search(
    body = query,
    index = index_name
)

{'took': 8, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 5, 'relation': 'eq'}, 'max_score': 2.5972693, 'hits': [{'_index': 'search_fun_revisited_custom_mappings', '_type': '_doc', '_id': 'doc_g', '_score': 2.5972693, '_source': {'id': 'doc_g', 'title': 'Dog', 'body': 'Dogs rule', 'price': '5.99', 'in_stock': True, 'category': 'childrens'}}, {'_index': 'search_fun_revisited_custom_mappings', '_type': '_doc', '_id': 'doc_f', '_score': 2.1871743, '_source': {'id': 'doc_f', 'title': 'Dogs are the best', 'body': 'Dogs beat cats every day of the week and twice on Sunday. A dog is always up for doing something.  Since there are so many dog breeds, there is a dog for everyone!', 'price': '50.99', 'in_stock': True, 'category': 'childrens'}}, {'_index': 'search_fun_revisited_custom_mappings', '_type': '_doc', '_id': 'doc_h', '_score': 1.484154, '_source': {'id': 'doc_h', 'title': 'Dog: The bounty hunter: living in the red', 

In [23]:
# Strict phrase query on "body" (no slop).
q = 'fox dog'
query = {
    "size": 5,
    "query": {
        "match_phrase": {
            "body": {"query": q},
        },
    },
}

client.search(index=index_name, body=query)

{'took': 15, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 0, 'relation': 'eq'}, 'max_score': None, 'hits': []}}

Notice that the query didn’t return any results! This is because phrase queries require that the tokens fox and dog occur next to each other. (👀 Recall our earlier discussion about capturing positional information? That’s how OpenSearch and Lucene know how to match phrase queries.) There are a number of approaches we can take to matching multiple terms in a query. For starters, we can use a boolean query (or one of its more advanced variants like “dis_max”) and simply look for documents that contain both terms or either term (equivalent to ANDing or ORing the two terms together, respectively), or we can execute what is called a “sloppy” phrase query.

The sloppy phrase query below (a phrase query with “slop” set to 10) says: find all documents where the terms “fox” and “dog” occur within 10 positions of each other. Running it yields:

In [25]:
# Phrase query on "body" again, this time with a slop of 10.
q = 'fox dog'
query = {
    "size": 5,
    "query": {
        "match_phrase": {
            "body": {"query": q, "slop": 10},
        },
    },
}

client.search(index=index_name, body=query)

{'took': 8, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': 0.45066714, 'hits': [{'_index': 'search_fun_revisited_custom_mappings', '_type': '_doc', '_id': 'doc_a', '_score': 0.45066714, '_source': {'id': 'doc_a', 'title': 'Fox and Hounds', 'body': 'The quick red fox jumped over the lazy brown dogs.', 'price': '5.99', 'in_stock': True, 'category': 'childrens'}}, {'_index': 'search_fun_revisited_custom_mappings', '_type': '_doc', '_id': 'doc_b', '_score': 0.22647524, '_source': {'id': 'doc_b', 'title': 'Fox wins championship', 'body': 'Wearing all red, the Fox jumped out to a lead in the race over the Dog.', 'price': '15.13', 'in_stock': True, 'category': 'sports'}}]}}

This type of phrase query is more expensive than a boolean AND, but it does have the benefit that the closer the terms in the phrase are to each other, the higher they will score.

Let’s do two more types of queries and then move on: filtering queries and function queries. Filter queries are non-scoring queries that reduce the result set by simply determining what documents match the filter query, and function queries use the values within a field as a scoring feature. Filter queries can be used to implement features like faceting or “search within a search”. Function queries allow us to do things like boost documents based on some external value like price, inventory or popularity. Let’s combine these two ideas into a single bigger query by finding all documents in our example set where the category is “childrens” and the item is in stock, boosted by the price:

In [35]:
# Match every document, keep only in-stock "childrens" items via filters,
# and apply a field_value_factor function on "price".
query = {
    "size": 5,
    "query": {
        "function_score": {
            "query": {
                "bool": {
                    "must": [{"match_all": {}}],
                    "filter": [
                        {"term": {"category": "childrens"}},
                        {"term": {"in_stock": True}},
                    ],
                },
            },
            # "missing" is the value used for documents that have no price.
            "field_value_factor": {"field": "price", "missing": 1},
        },
    },
}

client.search(index=index_name, body=query)

{'took': 3, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 4, 'relation': 'eq'}, 'max_score': 50.99, 'hits': [{'_index': 'search_fun_revisited_custom_mappings', '_type': '_doc', '_id': 'doc_f', '_score': 50.99, '_source': {'id': 'doc_f', 'title': 'Dogs are the best', 'body': 'Dogs beat cats every day of the week and twice on Sunday. A dog is always up for doing something.  Since there are so many dog breeds, there is a dog for everyone!', 'price': '50.99', 'in_stock': True, 'category': 'childrens'}}, {'_index': 'search_fun_revisited_custom_mappings', '_type': '_doc', '_id': 'doc_a', '_score': 5.99, '_source': {'id': 'doc_a', 'title': 'Fox and Hounds', 'body': 'The quick red fox jumped over the lazy brown dogs.', 'price': '5.99', 'in_stock': True, 'category': 'childrens'}}, {'_index': 'search_fun_revisited_custom_mappings', '_type': '_doc', '_id': 'doc_g', '_score': 5.99, '_source': {'id': 'doc_g', 'title': 'Dog', 'bo

Notice in these results that the score is equal to the price! That’s because the only factor that varies the score in this query is the field value factor function. The “match_all” query gives every document the same constant score of 1.0, and the filter clauses do not contribute to the score at all, so neither affects the ranking. This is a very common pattern in search applications and often has very positive performance implications. Can you think of some use cases in your own application where it might come in handy? 🤔

👀 Aside: If you only need a field for ranking, you might consider using the Rank Feature field and query for improved performance.

Before we leave querying, take time to familiarize yourself with the many different query types you can issue to OpenSearch via the documentation. Also note one really important aspect of the Elastic query DSL: queries are composable in many places! That is, you can often build a more sophisticated query by adding and grouping different types of queries via things like the “bool” query. You can also mix and match many of the other query types like geo, shapes, spans, and terms!



### Aggregations

Last, but not least, on our tour of OpenSearch is how to use aggregations. Let’s dive in by looking at a few types of aggregations (Elastic’s docs are, as usual, more complete) using our Python client.

To start, let’s do some basic bucketing/counting of fields and their terms:

In [41]:
# Terms aggregation over the "category" field.
# (A leftover match_all-only `query` dict that was immediately overwritten
# has been removed.)
query = {
    'size': 0,  # do not return any documents in the response
    'query': {
        "match_all": {}  # aggregate over all documents
    },
    'aggs': {
        "category": {
            "terms": {
                "field": "category",
                "size": 10,
                "missing": "N/A",  # bucket key for documents with no category
                "min_doc_count": 0
            }
        }
    }
}


response = client.search(
    body=query,
    index=index_name
)
print('\nSearch results:')
print(json.dumps(response, indent=4))



Search results:
{
    "took": 1,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 8,
            "relation": "eq"
        },
        "max_score": 1.0,
        "hits": [
            {
                "_index": "search_fun_revisited_custom_mappings",
                "_type": "_doc",
                "_id": "doc_a",
                "_score": 1.0,
                "_source": {
                    "id": "doc_a",
                    "title": "Fox and Hounds",
                    "body": "The quick red fox jumped over the lazy brown dogs.",
                    "price": "5.99",
                    "in_stock": true,
                    "category": "childrens"
                }
            }
        ]
    }
}


In this example, you see a few things in action:

We are executing a simple match_all query which means our aggregations will be calculated over all documents. We told OpenSearch not to return any hits (size=0). These two things together are a common pattern in aggregation-driven applications like dashboards.

The “terms” aggregation creates a bucket for each unique term in this field. In our case, there are only 3 unique terms, so this is an inexpensive calculation. Higher cardinality fields (e.g. an “author” field for an index of all books in the world) will be more expensive to aggregate on.

Even though we aren’t returning any hits, the query is still executed to generate a result set against which to calculate aggregations.

We specified “missing” so we could find out how many documents don’t have a value set for this field. This is a great way to check data quality – specifically field coverage – on your content. In our examples, all documents have a category field filled in.

Terms aggregations are one of the main workhorses of many search websites, but there are many other types of aggregations that can be done. Take a moment to familiarize yourself with them via the Elastic documentation. Let’s finish our look at aggregations by working through how we might aggregate the “price” field.



In [42]:
# Terms aggregation over the "price" field, returning no hits.
query = {
    "size": 0,
    "query": {"match_all": {}},
    "aggs": {
        "price": {
            "terms": {"field": "price", "size": 10, "min_doc_count": 0},
        },
    },
}

response = client.search(index=index_name, body=query)
print('\nSearch results:')
print(json.dumps(response, indent=4))


Search results:
{
    "took": 10,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 8,
            "relation": "eq"
        },
        "max_score": null,
        "hits": []
    },
    "aggregations": {
        "price": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
                {
                    "key": 5.989999771118164,
                    "doc_count": 2
                },
                {
                    "key": 3.509999990463257,
                    "doc_count": 1
                },
                {
                    "key": 15.130000114440918,
                    "doc_count": 1
                },
                {
                    "key": 27.5,
                    "doc_count": 1
                },
                {
                    "key": 50.9900016784668,
       

In [43]:
# Range aggregation over "price": below 5, from 5 to 20, and 20 and up.
# The first range omits "from" and the last omits "to", leaving them open-ended.
query = {
    "size": 0,
    "query": {"match_all": {}},
    "aggs": {
        "price": {
            "range": {
                "field": "price",
                "ranges": [
                    {"to": 5},
                    {"from": 5, "to": 20},
                    {"from": 20},
                ],
            },
        },
    },
}

response = client.search(index=index_name, body=query)
print('\nSearch results:')
print(json.dumps(response, indent=4))


Search results:
{
    "took": 6,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 8,
            "relation": "eq"
        },
        "max_score": null,
        "hits": []
    },
    "aggregations": {
        "price": {
            "buckets": [
                {
                    "key": "*-5.0",
                    "to": 5.0,
                    "doc_count": 1
                },
                {
                    "key": "5.0-20.0",
                    "from": 5.0,
                    "to": 20.0,
                    "doc_count": 3
                },
                {
                    "key": "20.0-*",
                    "from": 20.0,
                    "doc_count": 4
                }
            ]
        }
    }
}


While we picked arbitrary sized ranges so that we could show different counts in the buckets, you can imagine this type of structure is more useful to users since it groups common price points together. (How might you implement a common ecommerce facet of “$”, “$$”, “$$$” and “$$$$” using ranges?) Notice, also, that for the lower range (less than 5) and the upper range (greater than 20), we left off, respectively, the “from” and the “to” attributes so as to include all prices below and above those values.