# Week 1: OpenSearch Revisted

walkthrough from week1

In [1]:
from opensearchpy import OpenSearch

In [2]:
host = 'localhost'
port = 9200
auth = ('admin', 'admin')

In [5]:
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=auth,
    # client_cert = client_cert_path,
    # client_key = client_key_path,
    use_ssl=False,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

client.cat.health()

'1665358539 23:35:39 docker-cluster red 1 1 true 10 10 0 1 7 0 - 55.6%\n'

In [6]:
client.cat.indices()

'yellow open security-auditlog-2022.10.09              Eit_wMwaQRyOIb-DIiqk1A 1 1   307 0 411.2kb 411.2kb\nyellow open search_fun_bulk                           UIGKJqTQSZe2dSuSToVbFg 1 1     4 0   8.7kb   8.7kb\ngreen  open .kibana_92668751_admin_1                  PvTMugZRRg-DTF0k8sVnBw 1 0     1 0   5.1kb   5.1kb\nyellow open search_fun_revisited_custom_mappings      82pgK1jGSMy_bz282BS_9Q 1 1     0 0    208b    208b\ngreen  open opensearch_dashboards_sample_data_flights y8mZJN5YSE2jFQz2LDGqCA 1 0 13059 0   5.9mb   5.9mb\nyellow open bbuy_queries                              i7TOitB_R0SVgf4--lJbcw 1 1 38501 0   8.5mb   8.5mb\nred    open bbuy_products                             GnEYQmTnSv6Xyh_mnp_q2A 1 1                        \nyellow open search_fun_test                           GAoh6hy4RTivKo5m2UcoIQ 1 1     4 1   7.4kb   7.4kb\ngreen  open .opendistro_security                      fdmbnX58RVWd5Q9HCzsKAw 1 0    10 0  69.2kb  69.2kb\ngreen  open .kibana_1                        

In [7]:
client.cat.count("search_fun_test", params={"v": "true"})

'epoch      timestamp count\n1665358542 23:35:42  4\n'

## Indexing

In [8]:
index_name = 'search_fun_revisited'
index_body = {
    'settings': {
        'index': {
            'query':{
                'default_field': "body"
            }
        }
    }
}

In [9]:
if client.indices.exists(index_name):
    client.indices.delete(index_name)
response = client.indices.create(index_name, body=index_body)
response

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'search_fun_revisited'}

### load data

In [10]:
# Add our sample document to the index.
docs = [
    {
        "id": "doc_a",
        "title": "Fox and Hounds",
        "body": "The quick red fox jumped over the lazy brown dogs.",
        "price": "5.99",
        "in_stock": True,
        "category": "childrens"},
    {
        "id": "doc_b",
        "title": "Fox wins championship",
        "body": "Wearing all red, the Fox jumped out to a lead in the race over the Dog.",
        "price": "15.13",
        "in_stock": True,
        "category": "sports"},
    {
        "id": "doc_c",
        "title": "Lead Paint Removal",
        "body": "All lead must be removed from the brown and red paint.",
        "price": "150.21",
        "in_stock": False,
        "category": "instructional"},
    {
        "id": "doc_d",
        "title": "The Three Little Pigs Revisted",
        "price": "3.51",
        "in_stock": True,
        "body": "The big, bad wolf huffed and puffed and blew the house down. The end.",
        "category": "childrens"}
]

for doc in docs:
    doc_id = doc["id"]
    print("Indexing {}".format(doc_id))
    response = client.index(
        index=index_name,
        body=doc,
        id=doc_id,
        refresh=True
    )
    print('\n\tResponse:')
    print(response)

# Verify they are in:
client.cat.count(index_name, params={"v": "true"})

Indexing doc_a

	Response:
{'_index': 'search_fun_revisited', '_id': 'doc_a', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
Indexing doc_b

	Response:
{'_index': 'search_fun_revisited', '_id': 'doc_b', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
Indexing doc_c

	Response:
{'_index': 'search_fun_revisited', '_id': 'doc_c', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1}
Indexing doc_d

	Response:
{'_index': 'search_fun_revisited', '_id': 'doc_d', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1}


'epoch      timestamp count\n1665358546 23:35:46  4\n'

### Indexing Performance: Bulk Indexing 

In [11]:
from opensearchpy.helpers import bulk

In [12]:
index_name = 'search_fun_bulk'
index_body = {
    'settings': {
        'index': {
            'query': {
                'default_field': "body"
            }
        }
    }
}

if client.indices.exists(index_name):
    client.indices.delete(index_name)
client.indices.create(index_name, body=index_body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'search_fun_bulk'}

In [13]:
docs = [
    {
        "id": "doc_a",
        '_index': index_name,
        "title": "Fox and Hounds",
        "body": "The quick red fox jumped over the lazy brown dogs.",
        "price": "5.99",
        "in_stock": True,
        "category": "childrens"},
    {
        "id": "doc_b",
        '_index': index_name,
        "title": "Fox wins championship",
        "body": "Wearing all red, the Fox jumped out to a lead in the race over the Dog.",
        "price": "15.13",
        "in_stock": True,
        "category": "sports"},
    {
        "id": "doc_c",
        '_index': index_name,
        "title": "Lead Paint Removal",
        "body": "All lead must be removed from the brown and red paint.",
        "price": "150.21",
        "in_stock": False,
        "category": "instructional"},
    {
        "id": "doc_d",
        '_index': index_name,
        "title": "The Three Little Pigs Revisted",
        "price": "3.51",
        "in_stock": True,
        "body": "The big, bad wolf huffed and puffed and blew the house down. The end.",
        "category": "childrens"}
]

In [14]:
bulk(client, docs)
print(client.cat.count(index_name, params={"v": "true"}))

epoch      timestamp count
1665358548 23:35:48  0



### mappings

In [15]:
client.indices.get_mapping(index_name)

{'search_fun_bulk': {'mappings': {'properties': {'body': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'category': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'id': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'in_stock': {'type': 'boolean'},
    'price': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'title': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}}}

In [17]:
# Redefine our docs in case they are still in scope from the bulk indexing section
docs = [
    {
        "id": "doc_a",
        "title": "Fox and Hounds",
        "body": "The quick red fox jumped over the lazy brown dogs.",
        "price": "5.99",
        "in_stock": True,
        "category": "childrens"},
    {
        "id": "doc_b",
        "title": "Fox wins championship",
        "body": "Wearing all red, the Fox jumped out to a lead in the race over the Dog.",
        "price": "15.13",
        "in_stock": True,
        "category": "sports"},
    {
        "id": "doc_c",
        "title": "Lead Paint Removal",
        "body": "All lead must be removed from the brown and red paint.",
        "price": "150.21",
        "in_stock": False,
        "category": "instructional"},
    {
        "id": "doc_d",
        "title": "The Three Little Pigs Revisted",
        "price": "3.51",
        "in_stock": True,
        "body": "The big, bad wolf huffed and puffed and blew the house down. The end.",
        "category": "childrens"}
]

index_name = 'search_fun_revisited_custom_mappings'
index_body = {
    'settings': {
        'index': {
            'query': {
                'default_field': "body"
            }
        }
    },
    "mappings": {
        "properties": {
            "title": {"type": "text", "analyzer": "english"},
            "body": {"type": "text", "analyzer": "english"},
            "in_stock": {"type": "boolean"},
            "category": {"type": "keyword", "ignore_above": "256"},
            "price": {"type": "float"}
        }
    }
}

if client.indices.exists(index_name):
    client.indices.delete(index_name)
client.indices.create(index_name, body=index_body)



for doc in docs:
    doc_id = doc["id"]
    print("Indexing {}".format(doc_id))
    response = client.index(
        index=index_name,
        body=doc,
        id=doc_id,
        refresh=True
    )
    print('\n\tResponse:')
    print(response)

Indexing doc_a

	Response:
{'_index': 'search_fun_revisited_custom_mappings', '_id': 'doc_a', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
Indexing doc_b

	Response:
{'_index': 'search_fun_revisited_custom_mappings', '_id': 'doc_b', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
Indexing doc_c

	Response:
{'_index': 'search_fun_revisited_custom_mappings', '_id': 'doc_c', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1}
Indexing doc_d

	Response:
{'_index': 'search_fun_revisited_custom_mappings', '_id': 'doc_d', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1}


## Query

In [21]:
q = 'dogs'
index_name = 'search_fun_revisited_custom_mappings'
query = {
  'size': 5,
  'query': {
    'multi_match': {
      'query': q,
      'fields': ['title^2', 'body']
    }
  }
}

client.search(
    body = query,
    index = index_name
)

{'took': 36,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2, 'relation': 'eq'},
  'max_score': 0.71833557,
  'hits': [{'_index': 'search_fun_revisited_custom_mappings',
    '_id': 'doc_a',
    '_score': 0.71833557,
    '_source': {'id': 'doc_a',
     'title': 'Fox and Hounds',
     'body': 'The quick red fox jumped over the lazy brown dogs.',
     'price': '5.99',
     'in_stock': True,
     'category': 'childrens'}},
   {'_index': 'search_fun_revisited_custom_mappings',
    '_id': 'doc_b',
    '_score': 0.6548753,
    '_source': {'id': 'doc_b',
     'title': 'Fox wins championship',
     'body': 'Wearing all red, the Fox jumped out to a lead in the race over the Dog.',
     'price': '15.13',
     'in_stock': True,
     'category': 'sports'}}]}}

In [22]:
q = 'fox dog'
query = {
  'size': 5,
  'query': {
    'match_phrase': {
      'body': {"query": q}
    }
  }
}

client.search(
    body = query,
    index = index_name
)

{'took': 34,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

In [23]:
q = 'fox dog'
query = {
  'size': 5,
  'query': {
    'match_phrase': {
      'body': {"query": q, "slop": 10}
    }
  }
}

client.search(
    body = query,
    index = index_name
)

{'took': 8,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2, 'relation': 'eq'},
  'max_score': 0.39418244,
  'hits': [{'_index': 'search_fun_revisited_custom_mappings',
    '_id': 'doc_a',
    '_score': 0.39418244,
    '_source': {'id': 'doc_a',
     'title': 'Fox and Hounds',
     'body': 'The quick red fox jumped over the lazy brown dogs.',
     'price': '5.99',
     'in_stock': True,
     'category': 'childrens'}},
   {'_index': 'search_fun_revisited_custom_mappings',
    '_id': 'doc_b',
    '_score': 0.19532394,
    '_source': {'id': 'doc_b',
     'title': 'Fox wins championship',
     'body': 'Wearing all red, the Fox jumped out to a lead in the race over the Dog.',
     'price': '15.13',
     'in_stock': True,
     'category': 'sports'}}]}}

In [24]:
# try a match all query with a filter and a price factor
query = {
    'size': 5,
    'query': {
        "function_score": {
            "query": {
                "bool": {
                    "must": [
                        {"match_all": {}}
                    ],
                    "filter": [
                        {"term": {"category": "childrens"}}
                    ]
                }
            },
            "field_value_factor": {
                "field": "price",
                "missing": 1
            }
        }
    }
}

client.search(
    body=query,
    index=index_name
)

{'took': 214,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2, 'relation': 'eq'},
  'max_score': 5.99,
  'hits': [{'_index': 'search_fun_revisited_custom_mappings',
    '_id': 'doc_a',
    '_score': 5.99,
    '_source': {'id': 'doc_a',
     'title': 'Fox and Hounds',
     'body': 'The quick red fox jumped over the lazy brown dogs.',
     'price': '5.99',
     'in_stock': True,
     'category': 'childrens'}},
   {'_index': 'search_fun_revisited_custom_mappings',
    '_id': 'doc_d',
    '_score': 3.51,
    '_source': {'id': 'doc_d',
     'title': 'The Three Little Pigs Revisted',
     'price': '3.51',
     'in_stock': True,
     'body': 'The big, bad wolf huffed and puffed and blew the house down. The end.',
     'category': 'childrens'}}]}}

In [26]:
# try a match all query with a filter and a price factor
query = {
    "size": 10,
    "query": {
        "match_all": {}
    },
    "sort": [
        {
            "price": {
                "order": "asc"
            }
        }
    ]
}

client.search(
    body=query,
    index=index_name
)

{'took': 114,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 4, 'relation': 'eq'},
  'max_score': None,
  'hits': [{'_index': 'search_fun_revisited_custom_mappings',
    '_id': 'doc_d',
    '_score': None,
    '_source': {'id': 'doc_d',
     'title': 'The Three Little Pigs Revisted',
     'price': '3.51',
     'in_stock': True,
     'body': 'The big, bad wolf huffed and puffed and blew the house down. The end.',
     'category': 'childrens'},
    'sort': [3.51]},
   {'_index': 'search_fun_revisited_custom_mappings',
    '_id': 'doc_a',
    '_score': None,
    '_source': {'id': 'doc_a',
     'title': 'Fox and Hounds',
     'body': 'The quick red fox jumped over the lazy brown dogs.',
     'price': '5.99',
     'in_stock': True,
     'category': 'childrens'},
    'sort': [5.99]},
   {'_index': 'search_fun_revisited_custom_mappings',
    '_id': 'doc_b',
    '_score': None,
    '_source': {'id': 'doc_b',
     'title':

### aggregation

In [27]:
import json

In [28]:
query = {
    'size': 0,
    'query': {
        "match_all": {}
    },
    'aggs': {
        "category": {
            "terms": {
                "field": "category",
                "size": 10,
                "missing": "N/A",
                "min_doc_count": 0
            }
        }
    }
}

response = client.search(
    body=query,
    index=index_name
)
print('\nSearch results:')
print(json.dumps(response, indent=4))


Search results:
{
    "took": 211,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 4,
            "relation": "eq"
        },
        "max_score": null,
        "hits": []
    },
    "aggregations": {
        "category": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
                {
                    "key": "childrens",
                    "doc_count": 2
                },
                {
                    "key": "instructional",
                    "doc_count": 1
                },
                {
                    "key": "sports",
                    "doc_count": 1
                },
                {
                    "key": "N/A",
                    "doc_count": 0
                }
            ]
        }
    }
}


In [29]:
query = {
    'size': 0,
    'query': {
        "match_all": {}
    },
    'aggs': {
        "price": {
            "range": {
                "field": "price",
                "ranges": [
                    {
                        "to": 5
                    },
                    {
                        "from": 5,
                        "to": 20
                    },
                    {
                        "from": 20,
                    }
                ]
            }
        }
    }
}

response = client.search(
body = query,
index = index_name
)
print('\nSearch results:')
print(json.dumps(response, indent=4))


Search results:
{
    "took": 23,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 4,
            "relation": "eq"
        },
        "max_score": null,
        "hits": []
    },
    "aggregations": {
        "price": {
            "buckets": [
                {
                    "key": "*-5.0",
                    "to": 5.0,
                    "doc_count": 1
                },
                {
                    "key": "5.0-20.0",
                    "from": 5.0,
                    "to": 20.0,
                    "doc_count": 2
                },
                {
                    "key": "20.0-*",
                    "from": 20.0,
                    "doc_count": 1
                }
            ]
        }
    }
}
