In [1]:
import requests, json
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient
from elasticsearch import exceptions
import elasticsearch.helpers as ESH
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_colwidth', -1)

## Connect to ES

In [2]:
def pprint(data):
    print(json.dumps(data, indent=4, sort_keys=True))

In [11]:
import logging
def connect_elasticsearch():
    es = None
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    if es.ping():
        print('Connected!\n')
    else:
        print('Not connected!\n')
        print("RUN: sudo service elasticsearch start")
    return es

logging.basicConfig(level=logging.ERROR)

es = connect_elasticsearch()
elastic_info = Elasticsearch.info(es)
print ("Cluster info:", json.dumps(elastic_info, indent=4 ))

Connected!

Cluster info: {
    "name": "mopbzp174193",
    "cluster_name": "elasticsearch",
    "cluster_uuid": "UDwteJ-GTXGNi_RtAU923g",
    "version": {
        "number": "7.5.1",
        "build_flavor": "oss",
        "build_type": "rpm",
        "build_hash": "3ae9ac9a93c95bd0cdc054951cf95d88e1e18d96",
        "build_date": "2019-12-16T22:57:37.835892Z",
        "build_snapshot": false,
        "lucene_version": "8.3.0",
        "minimum_wire_compatibility_version": "6.8.0",
        "minimum_index_compatibility_version": "6.0.0-beta1"
    },
    "tagline": "You Know, for Search"
}


## CREATE INDEX

In [12]:
def create_index(es_object, index_name, mapping):
    settings = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
        "mappings": mapping,
    }
    try:
        if not es_object.indices.exists(index_name):
            # Ignore 400 means to ignore "Index Already Exist" error.
            es_object.indices.create(index=index_name, ignore=400, body=settings)
            response = "created"
        else:
            response = "exists"
    except Exception as ex:
        print(str(ex))
        response = "error"
        
    finally:
        return response

## Get All Indices

In [13]:
def get_all_index(es):
    try:
        if es != None:
            # returns a list of all the cluster's indices
            all_indices = es.indices.get_alias("*")

        return all_indices
    except Exception as e:
        print(e)
        return False

## DELETE INDEX

In [14]:
def delete_index(es, ind_name):
    try:
        es.indices.delete(index=ind_name, ignore=[400, 404])
        return True
    except:
        return False

## GET MAPPING

In [15]:
def get_mapping(es, index):
    try:
        mapping = es.indices.get_mapping(index)
        return True
    except Exception as e:
        print(e)
        return False

## GET ALL DOCUMENTS

In [39]:
def get_all_docs(index, sz):
    try:
        res = es.search(index=index,body = {
        'size' : sz,
        'query': {
            'match_all' : {}
        }
        })
        return res
    except Exception as e:
        print(e)
        return False

# CRUD

### INSERT

In [17]:
def set_doc(es, index_name, record):
    try:
        outcome = es.index(index=index_name,  body=record)
        response=True
    except Exception as ex:
        print('Error in indexing data')
        print(str(ex))
        response=False
        
    return response

### INSERT BULK

In [18]:
def set_doc_bulk(es, index, docs):
    try:
        n_success, n_fail = ESH.bulk(es, docs, index=index, stats_only=True)
        return n_success, n_fail
    except Exception as e:
        print(e)
        return False

### UPDATE

In [19]:
def update_doc(es, index, body, _id):
    try:
        es.update(index=index,id=_id, body=body, retry_on_conflict=5)
        return True
    except Exception as e:
        print(e)
        return False

### DELETE

In [20]:
def delete_doc(es, index, _id):
    try:
        res=es.delete(index=index,id=_id)
        return True
    except Exception as e:
        print(e)
        return False

## SEARCH

#### i- QUERY

In [21]:
def search(es_object, index_name, search):
    try:
        res = es_object.search(index=index_name, body=search)
        return res
    except Exception as e:
        print(e)
        return False

In [22]:
query={
  "query": {
    "query_string": {
      "query": "*star*",
      "analyzer": "snowball",
      "fields": ["title"],
      "default_operator": "and"
    }
  }
}
    
r=search(es, "movies", query)
pd.DataFrame(r["hits"]["hits"])


Unnamed: 0,_index,_type,_id,_score,_source
0,movies,_doc,NstVM28B_MBVO-3Hn8xA,1.0,"{'id': '135569', 'title': 'Star Trek Beyond', 'year': 2016, 'genre': ['Action', 'Adventure', 'Sci-Fi']}"
1,movies,_doc,N8tVM28B_MBVO-3Hn8xA,1.0,"{'id': '122886', 'title': 'Star Wars: Episode VII - The Force Awakens', 'year': 2015, 'genre': ['Action', 'Adventure', 'Fantasy', 'Sci-Fi', 'IMAX']}"


## EXAMPLE-1: DENORMALIZED INDEX

In [34]:
# create index
mapping={
    "properties": {
        "year": {
            "type": "date",
        },
        "genre": {
            "type": "keyword", # no analyzers applied, exact-matches ll'be returned for this field
        },
        "title": {
            "type": "text", # analyzers applied
            "analyzer": "english",
        }
    }
}
create_index(es, "movies", mapping)

'exists'

In [None]:
# delete_index(es, "series")

In [36]:
record={
    "genre": ["IMAX", "ACTION"],
    "title": "6 Underground",
    "year": 2019
}
set_doc(es, "movies", record)


Error in indexing data
AuthorizationException(403, 'cluster_block_exception', 'index [movies] blocked by: [FORBIDDEN/12/index read-only / allow delete (api)];')


False

In [41]:
doc1={ "id": "135569", "title" : "Star Trek Beyond", "year":2016 , "genre":["Action", "Adventure", "Sci-Fi"] }

doc2={ "id": "122886", "title" : "Star Wars: Episode VII - The Force Awakens", "year":2015 , "genre":["Action", "Adventure", "Fantasy", "Sci-Fi", "IMAX"] }

doc3={ "id": "109487", "title" : "Interstellar", "year":2014 , "genre":["Sci-Fi", "IMAX"] }

doc4={ "id": "58559", "title" : "Dark Knight, The", "year":2008 , "genre":["Action", "Crime", "Drama", "IMAX"] }

doc5={ "id": "1924", "title" : "Plan 9 from Outer Space", "year":1959 , "genre":["Horror", "Sci-Fi"] }

docs = [ doc1, doc2, doc3, doc4, doc5 ]

In [42]:
body={"doc": 
        {
            "genre": ["Action","Adventure"],
            "title": "6 Underground",
            "id": 135580,
        }
    }
update_doc(es, "movies", body, "Nd_XCG8B67ayOr6nWmi_")

AuthorizationException(403, 'cluster_block_exception', 'index [movies] blocked by: [FORBIDDEN/12/index read-only / allow delete (api)];')


False

## EXAMPLE-2: PARENT/CHILD INDEX

In [43]:
delete_index(es, "series")

True

In [44]:
# create index

mapping={
  "mappings": {
    "properties": {
      "genre": {
        "type": "keyword"
      },
      "film_to_franchise": { 
        "type": "join",
        "relations": {
          # "question": "answer"
          "franchise": "film"
        }
      }
    }
  }
}

create_index(es, "series", mapping)

'created'

In [45]:
d=[{'id': '1', 'film_to_franchise': {'name': 'franchise'}, 'title': 'Star Wars'},
 {'id': '260',
  'film_to_franchise': {'name': 'film', 'parent': '1'},
  'title': 'Star Wars: Episode IV - A New Hope',
  'year': '1977',
  'genre': ['Action', 'Adventure', 'Sci-Fi']},
 {'id': '1196',
  'film_to_franchise': {'name': 'film', 'parent': '1'},
  'title': 'Star Wars: Episode V - The Empire Strikes Back',
  'year': '1980',
  'genre': ['Action', 'Adventure', 'Sci-Fi']},
 {'id': '1210',
  'film_to_franchise': {'name': 'film', 'parent': '1'},
  'title': 'Star Wars: Episode VI - Return of the Jedi',
  'year': '1983',
  'genre': ['Action', 'Adventure', 'Sci-Fi']},
 {'id': '2628',
  'film_to_franchise': {'name': 'film', 'parent': '1'},
  'title': 'Star Wars: Episode I - The Phantom Menace',
  'year': '1999',
  'genre': ['Action', 'Adventure', 'Sci-Fi']},
 {'id': '5378',
  'film_to_franchise': {'name': 'film', 'parent': '1'},
  'title': 'Star Wars: Episode II - Attack of the Clones',
  'year': '2002',
  'genre': ['Action', 'Adventure', 'Sci-Fi', 'IMAX']},
 {'id': '33493',
  'film_to_franchise': {'name': 'film', 'parent': '1'},
  'title': 'Star Wars: Episode III - Revenge of the Sith',
  'year': '2005',
  'genre': ['Action', 'Adventure', 'Sci-Fi']},
 {'id': '122886',
  'film_to_franchise': {'name': 'film', 'parent': '1'},
  'title': 'Star Wars: Episode VII - The Force Awakens',
  'year': '2015',
  'genre': ['Action', 'Adventure', 'Fantasy', 'Sci-Fi', 'IMAX']}]



passed, failed = set_doc_bulk(es, "series", d)
print(passed, failed)
pd.DataFrame(d)

8 0


Unnamed: 0,id,film_to_franchise,title,year,genre
0,1,{'name': 'franchise'},Star Wars,,
1,260,"{'name': 'film', 'parent': '1'}",Star Wars: Episode IV - A New Hope,1977.0,"[Action, Adventure, Sci-Fi]"
2,1196,"{'name': 'film', 'parent': '1'}",Star Wars: Episode V - The Empire Strikes Back,1980.0,"[Action, Adventure, Sci-Fi]"
3,1210,"{'name': 'film', 'parent': '1'}",Star Wars: Episode VI - Return of the Jedi,1983.0,"[Action, Adventure, Sci-Fi]"
4,2628,"{'name': 'film', 'parent': '1'}",Star Wars: Episode I - The Phantom Menace,1999.0,"[Action, Adventure, Sci-Fi]"
5,5378,"{'name': 'film', 'parent': '1'}",Star Wars: Episode II - Attack of the Clones,2002.0,"[Action, Adventure, Sci-Fi, IMAX]"
6,33493,"{'name': 'film', 'parent': '1'}",Star Wars: Episode III - Revenge of the Sith,2005.0,"[Action, Adventure, Sci-Fi]"
7,122886,"{'name': 'film', 'parent': '1'}",Star Wars: Episode VII - The Force Awakens,2015.0,"[Action, Adventure, Fantasy, Sci-Fi, IMAX]"


In [46]:
pd.DataFrame(get_all_docs("series", 1000)["hits"]["hits"])

Unnamed: 0,_index,_type,_id,_score,_source
0,series,_doc,8n7FWHABs3C7qiFqPPoX,1.0,"{'id': '1', 'film_to_franchise': {'name': 'franchise'}, 'title': 'Star Wars'}"
1,series,_doc,837FWHABs3C7qiFqPPoY,1.0,"{'id': '260', 'film_to_franchise': {'name': 'film', 'parent': '1'}, 'title': 'Star Wars: Episode IV - A New Hope', 'year': '1977', 'genre': ['Action', 'Adventure', 'Sci-Fi']}"
2,series,_doc,9H7FWHABs3C7qiFqPPoY,1.0,"{'id': '1196', 'film_to_franchise': {'name': 'film', 'parent': '1'}, 'title': 'Star Wars: Episode V - The Empire Strikes Back', 'year': '1980', 'genre': ['Action', 'Adventure', 'Sci-Fi']}"
3,series,_doc,9X7FWHABs3C7qiFqPPoY,1.0,"{'id': '1210', 'film_to_franchise': {'name': 'film', 'parent': '1'}, 'title': 'Star Wars: Episode VI - Return of the Jedi', 'year': '1983', 'genre': ['Action', 'Adventure', 'Sci-Fi']}"
4,series,_doc,9n7FWHABs3C7qiFqPPoY,1.0,"{'id': '2628', 'film_to_franchise': {'name': 'film', 'parent': '1'}, 'title': 'Star Wars: Episode I - The Phantom Menace', 'year': '1999', 'genre': ['Action', 'Adventure', 'Sci-Fi']}"
5,series,_doc,937FWHABs3C7qiFqPPoY,1.0,"{'id': '5378', 'film_to_franchise': {'name': 'film', 'parent': '1'}, 'title': 'Star Wars: Episode II - Attack of the Clones', 'year': '2002', 'genre': ['Action', 'Adventure', 'Sci-Fi', 'IMAX']}"
6,series,_doc,-H7FWHABs3C7qiFqPPoY,1.0,"{'id': '33493', 'film_to_franchise': {'name': 'film', 'parent': '1'}, 'title': 'Star Wars: Episode III - Revenge of the Sith', 'year': '2005', 'genre': ['Action', 'Adventure', 'Sci-Fi']}"
7,series,_doc,-X7FWHABs3C7qiFqPPoY,1.0,"{'id': '122886', 'film_to_franchise': {'name': 'film', 'parent': '1'}, 'title': 'Star Wars: Episode VII - The Force Awakens', 'year': '2015', 'genre': ['Action', 'Adventure', 'Fantasy', 'Sci-Fi', 'IMAX']}"


## SEARCH

**filters**: ask a yes/no question of your data

**queries**: return data in terms of relevance

Use filters when you can, they are fast and cacheable.

### 1) Boolean query with filter

Movie which must have “”trek” in title and its year is gte to 2010.

In BOOL query we can combine multiple queries,
MUST is equivalent to AND in ES

In [49]:
query={
    "query": {
        "bool": {
            "must": {"term": {"title": "trek"}},
            "filter": {"range": {"year": {"gte": 2010}}}
        }
    }
}
r=search(es, "movies", query)
pprint(r)

{
    "_shards": {
        "failed": 0,
        "skipped": 0,
        "successful": 1,
        "total": 1
    },
    "hits": {
        "hits": [
            {
                "_id": "NstVM28B_MBVO-3Hn8xA",
                "_index": "movies",
                "_score": 1.5169398,
                "_source": {
                    "genre": [
                        "Action",
                        "Adventure",
                        "Sci-Fi"
                    ],
                    "id": "135569",
                    "title": "Star Trek Beyond",
                    "year": 2016
                },
                "_type": "_doc"
            }
        ],
        "max_score": 1.5169398,
        "total": {
            "relation": "eq",
            "value": 1
        }
    },
    "timed_out": false,
    "took": 41
}


BOOL QUERY and BOOL FILTER are 2 different thing, which both have must, must_not and should operator.
Instead of filtering out the results that don’t meet the BOOLEAN condition, it actually scores the result by relevance.

Queries are wrapped in query:{} block
Filters are wrapped in filter:{} block

We can nest them inside each other.

Filters can combine to do more complex things

### 2) PHRASE SEARCH

Phrase matching: must find all terms in right order

### 3) SLOP

SLOP: order maters but you’re ok with some words being in between the terms.
Slop represents how far you’re willing to let a term move to satisfy a phrase(in either direction).

Another example: “Quick brown fox” would match “quick fox” with a slop of 1.


PROXIMITY QUERIES:
Remember this is a query, results are sorted by relevance.
Just use a really high slop if you want to get any documents that contains the words in your phrase but want document that have the words closer together scored higher.
query:{
	match_phrase:{
		title:{quert:star beyond, slop:100}
	}
}


Match: e.g.  “Star Wars”, match query will treat this as a seprate words and will return movies which have star and wars in them in title.

Match_phrase will treat it like a phrase and return movies which have “star wars” phrase in the title.

### 4) Pagination
Specify “from” and “size”

*WARNING*

- Deep pagination can kill performance
- Every result muse be retreived, collected and sorted.
- Enforce an upper bound on how many result you'll return users.

In [50]:
query={
    "from": 0,
    "size": 4,
    "query": {"match": {"genre": "Sci-Fi"}}
}

r=search(es, "movies", query)
pprint(r)

{
    "_shards": {
        "failed": 0,
        "skipped": 0,
        "successful": 1,
        "total": 1
    },
    "hits": {
        "hits": [
            {
                "_id": "OMtVM28B_MBVO-3Hn8xA",
                "_index": "movies",
                "_score": 0.640912,
                "_source": {
                    "genre": [
                        "Sci-Fi",
                        "IMAX"
                    ],
                    "id": "109487",
                    "title": "Interstellar",
                    "year": 2014
                },
                "_type": "_doc"
            },
            {
                "_id": "OstVM28B_MBVO-3Hn8xA",
                "_index": "movies",
                "_score": 0.640912,
                "_source": {
                    "genre": [
                        "Horror",
                        "Sci-Fi"
                    ],
                    "id": "1924",
                    "title": "Plan 9 from Outer Space",
                    "

### SORTING

INT: 
Simple for integer fields, sort results on any integer fields

STR: 
A string that is analysed(partial matches/fuzzy queries) for full-text search can’t be used to sort documents.
  
  This is bcz it exists in the inverted index as individual terms, not as entire string.

#### To sorting on analysed field

- Map a keyword copy by defining a sub-field that is not analysed.  
- You cannot change the mappings on existing index.
- You’d have to delete it, setup a new mapping, and re-index it.
- Like the number of shards, this is something you should think about before importing data into your index.

In [51]:
# mappings={
#     "mappings":{
#         "properties":{
#             "title":{
#                 "type": "text",
#                 "fields": {
#                     "raw": {"type": "keyword"}
#                 }
#             }
#         }
#     }
# }

mappings={"mappings" : {
      "properties" : {
        "title" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
      }
    }
 }
delete_index(es,"movies")
create_index(es, "movies", mapping)
passed, failed = set_doc_bulk(es, "movies", docs)
passed, failed
# pprint(mappings)

(5, 0)

In [52]:
# url=127.0.0.1:9200/movies/_search?sort=title.raw&pretty
query={
    
    "query":{
        "bool":{
            "must":{"match": {"genre": "Sci-Fi"}},
            "filter":{"range":{"year":{"lt": 2016}}},
        } 
    },
    "sort":[
#         {"year": {"order": "desc"}}
        {"title.keyword": {"order": "desc"}}
    ],
}

r=search(es, "movies", query)
pprint(r)

{
    "_shards": {
        "failed": 0,
        "skipped": 0,
        "successful": 1,
        "total": 1
    },
    "hits": {
        "hits": [
            {
                "_id": "-37FWHABs3C7qiFqmvo9",
                "_index": "movies",
                "_score": null,
                "_source": {
                    "genre": [
                        "Action",
                        "Adventure",
                        "Fantasy",
                        "Sci-Fi",
                        "IMAX"
                    ],
                    "id": "122886",
                    "title": "Star Wars: Episode VII - The Force Awakens",
                    "year": 2015
                },
                "_type": "_doc",
                "sort": [
                    "Star Wars: Episode VII - The Force Awakens"
                ]
            },
            {
                "_id": "_n7FWHABs3C7qiFqmvo9",
                "_index": "movies",
                "_score": null,
                "_sourc

### FUZZY MATCHES

- To handle typos or Mis-spellings in ES
- The Levenshtein edit distance accounts for:
    - substitutions
    - insertions
    - deletion  
    
- fuzziness: AUTO:
    - 0 for 1-2 chars string
    - 1 for 3-5 chars string    
    - 2 for anything else    

In [53]:
query={
    "query":{
        "fuzzy":{
            "title": {"value": "intrsteller", "fuzziness": 2}
        }
    }
}
r=search(es, "movies", query)
pprint(r)

{
    "_shards": {
        "failed": 0,
        "skipped": 0,
        "successful": 1,
        "total": 1
    },
    "hits": {
        "hits": [
            {
                "_id": "_H7FWHABs3C7qiFqmvo9",
                "_index": "movies",
                "_score": 1.6236734,
                "_source": {
                    "genre": [
                        "Sci-Fi",
                        "IMAX"
                    ],
                    "id": "109487",
                    "title": "Interstellar",
                    "year": 2014
                },
                "_type": "_doc"
            }
        ],
        "max_score": 1.6236734,
        "total": {
            "relation": "eq",
            "value": 1
        }
    },
    "timed_out": false,
    "took": 55
}


#### Partial Matches

In [54]:
query={
    "query":{
        "prefix":{
            "title": "int"
        }
    }
}
r=search(es, "movies", query)
pprint(r)

{
    "_shards": {
        "failed": 0,
        "skipped": 0,
        "successful": 1,
        "total": 1
    },
    "hits": {
        "hits": [
            {
                "_id": "_H7FWHABs3C7qiFqmvo9",
                "_index": "movies",
                "_score": 1.0,
                "_source": {
                    "genre": [
                        "Sci-Fi",
                        "IMAX"
                    ],
                    "id": "109487",
                    "title": "Interstellar",
                    "year": 2014
                },
                "_type": "_doc"
            }
        ],
        "max_score": 1.0,
        "total": {
            "relation": "eq",
            "value": 1
        }
    },
    "timed_out": false,
    "took": 7
}


## Auto complete and suggestions

QUERY-TIME SEARCH AS YOU TYPE

In [55]:
query={
    "query":{
        "match_phrase_prefix":{
            "title":{
                "query": "star trek",
                "slop": 10
            }
        }
    }
}

r=search(es, "movies", query)
pprint(r)

{
    "_shards": {
        "failed": 0,
        "skipped": 0,
        "successful": 1,
        "total": 1
    },
    "hits": {
        "hits": [
            {
                "_id": "-n7FWHABs3C7qiFqmvo9",
                "_index": "movies",
                "_score": 2.4749134,
                "_source": {
                    "genre": [
                        "Action",
                        "Adventure",
                        "Sci-Fi"
                    ],
                    "id": "135569",
                    "title": "Star Trek Beyond",
                    "year": 2016
                },
                "_type": "_doc"
            }
        ],
        "max_score": 2.4749134,
        "total": {
            "relation": "eq",
            "value": 1
        }
    },
    "timed_out": false,
    "took": 21
}


## N-GRAMS

N-GRAMS:
"star"

- unigram: [s,t,a,r]
- bigram:  [st,ta,ar]
- trigram: [sta,tar]
- 4-gram:  [star]

EDGE N-GRAMS: 
"star"

These are built only on the beginning of each terms e.g.
  For STAR  the edge n-grams will be:
- unigram: s
- bigram:   st
- trigram:   sta
- 4-gram:   star

Indexing n-grams

1) Create an auto-coplete analyzer

In [56]:
mapping={
    "settings":{
        "analysis":{
            "filter":{
                "autocomplete_filter":{
                    "type": "edge_ngram",
                    "min_gram": 1,
                    "max_gram": 20,
                }
            },
            "analyzer":{
                "autocomplete":{
                    "type":"custom",
                    "tokenizer": "standard",
                    "filter":[
                        "lowercase",
                        "autocomplete_filter"
                    ]
                }
            }
        }
    }
}
delete_index(es,"movies")
r=create_index(es, "movies", mapping)
passed, failed = set_doc_bulk(es, "movies", docs)
r,passed, failed

('created', 5, 0)