In [11]:
# %pip install rich -Uqq
# %pip install fastcore -Uqq
# %pip install nbdev -Uqq

In [12]:
from rich import inspect as rinspect
from rich import print as rprint
from fastcore.test import *
from fastcore.net import *
from fastcore.basics import *

In [13]:
from opensearchpy import OpenSearch
import json
import os  # needed only to read the optional credential overrides below

# Address of the OpenSearch node this notebook talks to.
host, port = 'localhost', 9200
# Credentials are read from the environment when provided, so real secrets never
# have to be committed with the notebook; the fallback is the original
# 'admin'/'admin' pair.
auth = (
    os.environ.get('OPENSEARCH_USER', 'admin'),
    os.environ.get('OPENSEARCH_PASSWORD', 'admin'),
)

In [14]:
# Client for the node configured above. SSL is enabled, while certificate
# verification, hostname assertion and the related warning are all switched off.
cli = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_auth=auth,
    http_compress=True,
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

In [15]:
# Show the docstring of the client's `cat.health` method.
rprint(cli.cat.health.__doc__)

In [16]:
# Look at the cluster before indexing anything: first the cluster health
# (status and cluster columns), then the list of indices with their health,
# status, name and document count.
#cli.cat.health(format='json')
rprint(cli.cat.health(h=['status', 'cluster'], v=True))
rprint(cli.cat.indices(h=['health', 'status', 'index', 'docs.count'], v=True))

## Indexing

- Create an index `searchml_revisited` with non-default settings.

In [17]:
# Inspect AttrDict (used below to build the index body) with rich.
rinspect(AttrDict)

In [18]:
# Body for index creation: the only non-default setting is
# `index.query.default_field`, which is set to `body`.
index_name = 'searchml_revisited'
q_default_fld = dict(query=dict(default_field='body'))
d = dict(settings=dict(index=q_default_fld))
index_body = AttrDict(d)
index_body

- settings: 
  - index: 
    - query: 
      - default_field: body

In [57]:
from opensearchpy import NotFoundError

def _exists_index(index):
    """Return True if `index` already exists on the cluster, False otherwise.

    Asks the dedicated `indices.exists` API instead of parsing the text output
    of `cat.indices`, so no exception handling is needed for a missing index.
    """
    return bool(cli.indices.exists(index=index))

# Create the index only when it is missing, so the cell can be re-run safely.
# `resp` is None when the index was already there, which keeps the name bound
# either way; ending the cell with `resp` displays the creation response.
resp = None if _exists_index(index_name) else cli.indices.create(index=index_name, body=index_body)
resp

In [20]:
# The cat API should list exactly the index name we asked for.
test_eq(cli.cat.indices('searchml_revisited', h='index').strip(), 'searchml_revisited')

In [21]:
# Sample documents to add to the index.
# Each one carries its own "id", which the indexing cells use as the document id.
# NOTE: "price" is given as a string here, not as a number.
docs = [
    {
        "id": "doc_a",
        "title": "Fox and Hounds",
        "body": "The quick red fox jumped over the lazy brown dogs.",
        "price": "5.99",
        "in_stock": True,
        "category": "childrens"},
    {
        "id": "doc_b",
        "title": "Fox wins championship",
        "body": "Wearing all red, the Fox jumped out to a lead in the race over the Dog.",
        "price": "15.13",
        "in_stock": True,
        "category": "sports"},
    {
        "id": "doc_c",
        "title": "Lead Paint Removal",
        "body": "All lead must be removed from the brown and red paint.",
        "price": "150.21",
        "in_stock": False,
        "category": "instructional"},
    {
        "id": "doc_d",
        "title": "The Three Little Pigs Revisted",
        "price": "3.51",
        "in_stock": True,
        "body": "The big, bad wolf huffed and puffed and blew the house down. The end.",
        "category": "childrens"}
]

In [22]:
# Index each sample document under its own id, passing refresh=True on every
# call, and keep the responses.
index_name = 'searchml_revisited'
resps = [
    cli.index(index=index_name, id=doc['id'], body=doc, refresh=True)
    for doc in docs
]

In [24]:
# The checks below are commented out because they are not repeatable.
# On the first run, against an empty index, every result is 'created':
# [test_eq(resp['result'], 'created') for resp in resps]
# If already created
# [test_eq(resp['result'], 'updated') for resp in resps]

In [25]:
# The cat API should report a document count of 4 for the index.
test_eq(
    cli.cat.count(index_name, h='count').strip(),
    '4',
)

**Be intentional with your data mapping!** You can always override your configuration or perform other computations at runtime if you need to.

In OpenSearch it is possible to explicitly define the data types and text analysis via what are called Field Mappings or simply Mappings.

In [26]:
# Show the help text for `indices.get_mapping` before calling it.
rinspect(cli.indices.get_mapping, help=True)

In [27]:
# Fetch the mapping currently stored for the index and wrap it in AttrDict
# for display.
AttrDict(cli.indices.get_mapping(index=index_name))

- searchml_revisited: 
  - mappings: 
    - properties: 
      - body: 
        - type: text
        - fields: 
          - keyword: 
            - type: keyword
            - ignore_above: 256
      - category: 
        - type: text
        - fields: 
          - keyword: 
            - type: keyword
            - ignore_above: 256
      - id: 
        - type: text
        - fields: 
          - keyword: 
            - type: keyword
            - ignore_above: 256
      - in_stock: 
        - type: boolean
      - price: 
        - type: text
        - fields: 
          - keyword: 
            - type: keyword
            - ignore_above: 256
      - title: 
        - type: text
        - fields: 
          - keyword: 
            - type: keyword
            - ignore_above: 256

- **Multi fields**
  - OpenSearch added a `keyword` sub-field to every field that it mapped as `text`
  - aggregating on text-based fields is inefficient, which is what the `keyword` sub-field is for

- in_stock is of type boolean which can be aggregated since we have only two values.

- `ignore_above` is set to 256, so the `keyword` sub-field will not index values longer than 256 characters. Those values will not show up in any aggregations or searches that use the `keyword` sub-field.


- The “price” field was chosen to be “text” even though we only ever passed in numeric values. This means we may get unexpected results from sorting or range filtering, since the values will be treated like strings rather than numbers.


**Well designed index structure**

- be explicit about mappings
- use multiple fields to represent the same piece of document content, e.g. a field indexed with multiple analyzers (with and without stemming), [autocompletion](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/search-suggesters.html#completion-suggester), [search-as-you-type](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/search-as-you-type.html), [joins](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/parent-join.html)

Reindex if field mapping is changed after indexing.

### Analyzer

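- A `text` field uses the [Standard Analyzer](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/analysis-standard-analyzer.html) by default
- An analyzer consists of 3 components: 1. zero or more [character filters](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/analysis-charfilters.html) 2. a [tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/analysis-tokenizers.html) 3. zero or more [token filters](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/analysis-tokenfilters.html)
- Character filters: strip things like HTML tags
- Tokenizer: splits text into tokens
- Token filters: add/update/delete tokens before handing off to Lucene.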
![alt text](https://corise.com/_next/image?url=https%3A%2F%2Fcorise.com%2Fstatic%2Fcourse%2Fsearch-with-machine-learning%2Fassets%2Fckyclv9qd000n14727vnn8zax%2Fimage-6.jpg&w=384&q=75 "Standard Analyzer")
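- [Standard Tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/analysis-standard-tokenizer.html): the Lucene tokenizer that splits text into tokens based on Unicode rules. It is not suitable for languages that do not use whitespace to delineate words (e.g. ja, zh)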
- Stopwords, Stemming

In [28]:
# Display the current index name together with the current index body.
index_name, AttrDict(index_body)

('searchml_revisited',
 - settings: 
   - index: 
     - query: 
       - default_field: body)

In [42]:
# Drop any `mapping` left over from an earlier run. Unlike a bare `del`, this
# does not raise NameError when the name has not been defined yet (fresh kernel).
# The trailing semicolon keeps the popped value out of the cell output.
globals().pop('mapping', None);

In [48]:
# Explicit field mappings, built in a single literal:
#   - title and body are `text` analyzed with the `english` analyzer
#   - in_stock is a boolean
#   - category is a `keyword` with ignore_above given as an integer
#   - price is a float
# No mapping is declared for "id".
mapping = AttrDict(
    properties=AttrDict(
        title={'type': 'text', 'analyzer': 'english'},
        body={'type': 'text', 'analyzer': 'english'},
        in_stock={'type': 'boolean'},
        category={'type': 'keyword', 'ignore_above': 256},
        price={'type': 'float'},
    )
)
mapping

- properties: 
  - title: 
    - type: text
    - analyzer: english
  - body: 
    - type: text
    - analyzer: english
  - in_stock: 
    - type: boolean
  - category: 
    - type: keyword
    - ignore_above: 256
  - price: 
    - type: float

In [49]:
# Display the index body as it currently stands.
index_body

- settings: 
  - index: 
    - query: 
      - default_field: body

In [53]:
# Attach the explicit mappings to the index body (in place).
index_body["mappings"] = mapping

In [54]:
# Display the index body again to confirm what it now contains.
index_body

- settings: 
  - index: 
    - query: 
      - default_field: body
- mappings: 
  - properties: 
    - title: 
      - type: text
      - analyzer: english
    - body: 
      - type: text
      - analyzer: english
    - in_stock: 
      - type: boolean
    - category: 
      - type: keyword
      - ignore_above: 256
    - price: 
      - type: float

In [59]:
index_name = 'searchml_revisited_custom_mappings'

# Create the index only when it is missing, so the cell can be re-run safely.
# `resp` is None when the index was already there, which keeps the name bound
# either way; ending the cell with `resp` displays the creation response.
resp = None if _exists_index(index_name) else cli.indices.create(index=index_name, body=index_body)
resp

In [61]:
# Index the same sample documents into the custom-mappings index, each under
# its own id with refresh=True, and keep the responses.
index_name = 'searchml_revisited_custom_mappings'
resps = [
    cli.index(index=index_name, id=doc['id'], body=doc, refresh=True)
    for doc in docs
]

In [64]:
# Inspect the client's search method; the variant below also prints its help text.
rinspect(cli.search)
#rinspect(cli.search, help=True)

In [81]:
# Note: this query could have been run earlier, right after the first index
# was populated.
# GET searchml_revisited/_search?q=body:dogs
resp = cli.search(index='searchml_revisited', params={'q': 'body:dogs'})
rprint(resp)

In [82]:
# Expect a single hit on the default-mapped index.
# NOTE(review): presumably only doc_a matches because its body contains the
# exact token "dogs" -- confirm against the printed response.
test_eq(resp['hits']['total']['value'], 1)

In [83]:

# Same query, this time against the index created with explicit mappings.
# GET searchml_revisited_custom_mappings/_search?q=body:dogs
resp = cli.search(index='searchml_revisited_custom_mappings', params={'q': 'body:dogs'})
rprint(resp)

In [84]:
# Expect two hits on the custom-mappings index.
# NOTE(review): presumably the `english` analyzer stems "dogs" to "dog", so
# doc_b's "Dog" matches as well as doc_a -- confirm against the printed response.
test_eq(resp['hits']['total']['value'], 2)