### Connect to Elasticsearch

In [17]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

print(es.ping())

True


### Create an Index

#### What is an Index in Elasticsearch?
An index in Elasticsearch is similar to a database in a relational database system. It is a collection of documents that share similar characteristics. Each document is stored as a JSON object and has a unique identifier.

In [21]:
index_name = "test_index"

if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name)
    print(f"Index '{index_name}' created.")
else:
    print(f"Index '{index_name}' already exists.")

Index 'test_index' already exists.


In [22]:
doc = {
    "name": "John Doe",
    "age": 30,
    "occupation": "Software Engineer",
    "location": "San Francisco"
}

res = es.index(index=index_name, id=1, document=doc)
print(res)

{'_index': 'test_index', '_id': '1', '_version': 2, 'result': 'updated', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 12, '_primary_term': 3}


In [23]:
query = {
    "query": {
        "match": {
            "occupation": "Software Engineer"
        }
    }
}

res = es.search(index=index_name, body=query)
print(res)


{'took': 73, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1, 'relation': 'eq'}, 'max_score': 2.858674, 'hits': [{'_index': 'test_index', '_id': '1', '_score': 2.858674, '_source': {'name': 'John Doe', 'age': 30, 'occupation': 'Software Engineer', 'location': 'San Francisco'}}]}}


In [24]:
update_query = {
    "doc": {
        "age": 31
    }
}

es.update(index=index_name, id=1, body=update_query)

ObjectApiResponse({'_index': 'test_index', '_id': '1', '_version': 3, 'result': 'updated', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 13, '_primary_term': 3})

In [25]:
es.delete(index=index_name, id=1)


ObjectApiResponse({'_index': 'test_index', '_id': '1', '_version': 4, 'result': 'deleted', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 14, '_primary_term': 3})

In [26]:
aggregation_query = {
    "size": 0,
    "aggs": {
        "average_age": {
            "avg": {
                "field": "age"
            }
        }
    }
}

res = es.search(index=index_name, body=aggregation_query)
print(res)

{'took': 7, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 9, 'relation': 'eq'}, 'max_score': None, 'hits': []}, 'aggregations': {'average_age': {'value': 30.444444444444443}}}


In [27]:
from elasticsearch.helpers import bulk

# Bulk inserting multiple documents
actions = [
    {"_index": index_name, "_id": i, "_source": {"name": f"User {i}", "age": 25 + i, "occupation": "Developer"}}
    for i in range(2, 10)
]

bulk(es, actions)


(8, [])

### What is a Shard in Elasticsearch?
A shard is a subdivision of an index. Each index is split into multiple shards for better performance and fault tolerance.

Primary Shards: Store the actual data.
Replica Shards: Backup copies of primary shards for redundancy.
Each shard is an independent Lucene index that can be stored on a different node in a cluster.

Example: How Shards Improve Performance
Imagine we have 1 million documents in an index. Instead of storing all of them in a single server, we can distribute them across multiple shards, which can then be placed on different servers (nodes).

Creating an Index with Shards and Replicas

In [5]:
sharded_mapping = {
    "settings": {
        "number_of_shards": 3,  # Multiple shards for parallel processing
        "number_of_replicas": 1
    },
}

if es.indices.exists(index='sharded_index'):
    es.indices.delete(index='sharded_index')  # Delete existing index to avoid conflicts
    print(f"Deleted existing index: sharded_index")

es.indices.create(index='sharded_index', body=sharded_mapping)
print(f"Created index: sharded_index")


Deleted existing index: sharded_index
Created index: sharded_index


Creating an Index with no Shards or Replicas

In [None]:
unsharded_mapping = {
    "settings": {
        "number_of_shards": 1,  # Multiple shards for parallel processing
        "number_of_replicas": 1
    },
}

if es.indices.exists(index='unsharded_index'):
    es.indices.delete(index='unsharded_index')  # Delete existing index to avoid conflicts
    print(f"Deleted existing index: unsharded_index")

es.indices.create(index='unsharded_index', body=unsharded_mapping)
print(f"Created index: unsharded_index")


Deleted existing index: unsharded_index
Created index: unsharded_index


In [None]:
import pandas as pd
from elasticsearch.helpers import bulk

df = pd.read_csv("crime_data.csv")
df.fillna("", inplace=True)

actions_unsharded_index = [
    {
        "_index": 'unsharded_index',
        "_id": row["DR_NO"],
        "_source": row.to_dict()
    }
    for _, row in df.iterrows()
]

actions_sharded_index = [
    {
        "_index": 'sharded_index',
        "_id": row["DR_NO"],
        "_source": row.to_dict()
    }
    for _, row in df.iterrows()
]

success, failed = bulk(es, actions_unsharded_index)

  df.fillna("", inplace=True)


BulkIndexError: 1 document(s) failed to index.

In [None]:
success, failed = bulk(es, actions_sharded_index)

### Mapping
Mapping is the process of defining how documents and their fields are stored and indexed in Elasticsearch.

It defines data types (e.g., text, keyword, date) and specifies how data should be interpreted.

Mapping is crucial for accurate search results, performance optimization, and data integrity.

#### Dynamic Mapping
Dynamic mapping allows Elasticsearch to automatically detect and assign data types for new fields.

This is useful for rapidly changing or unpredictable data structures.

Example :

In [None]:
import json
doc = {
    "title": "Learning Elasticsearch",
    "pages": 320,
    "price": 29.99,
    "published": "2024-03-01"
}

# Index the document
es.index(index='books', id=1, document=doc)

# View the generated mapping
mapping = es.indices.get_mapping(index='books')

# Pretty-print the JSON mapping
print("Dynamic Mapping Result:\n", mapping)

Dynamic Mapping Result:
 {'books': {'mappings': {'properties': {'pages': {'type': 'long'}, 'price': {'type': 'float'}, 'published': {'type': 'date'}, 'title': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}}}


In [13]:
doc2 = {
    "pages": 320,
    "price": 29.99,
    "published": "2024-03-01",
    "authors": ["John Doe", "Alice Smith"]
}

# Index the document
es.index(index='books', id=2, document=doc2)

# View the generated mapping
mapping = es.indices.get_mapping(index='books')

# Pretty-print the JSON mapping
print("Dynamic Mapping Result:\n", mapping)

Dynamic Mapping Result:
 {'books': {'mappings': {'properties': {'authors': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'pages': {'type': 'long'}, 'price': {'type': 'float'}, 'published': {'type': 'date'}, 'title': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}}}


In [15]:
# get full index 
res = es.indices.get(index='books')
print(res)

# get content of index
res = es.get(index='books', id=1)
print(res)

resq = es.get(index='books', id=2)
print(resq)

# get documents of index
res = es.search(index='books')
print(res)

{'books': {'aliases': {}, 'mappings': {'properties': {'authors': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'pages': {'type': 'long'}, 'price': {'type': 'float'}, 'published': {'type': 'date'}, 'title': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}, 'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}}, 'number_of_shards': '1', 'provided_name': 'books', 'creation_date': '1741644888280', 'number_of_replicas': '1', 'uuid': 'wYW4AYnATgCUWQryrQ-pwg', 'version': {'created': '8050099'}}}}}
{'_index': 'books', '_id': '1', '_version': 6, '_seq_no': 5, '_primary_term': 2, 'found': True, '_source': {'title': 'Learning Elasticsearch', 'pages': 320, 'price': 29.99, 'published': '2024-03-01'}}
{'_index': 'books', '_id': '2', '_version': 2, '_seq_no': 7, '_primary_term': 2, 'found': True, '_source': {'pages': 320, 'price': 29.99, 'published': '2024-03-01', 'authors': ['John Doe

##### Date Detection
If date_detection is enabled (default), then new string fields are checked to see whether their contents match any of the date patterns specified in dynamic_date_formats. If a match is found, a new date field is added with the corresponding format.

The default value for dynamic_date_formats is:

In [None]:
[ "strict_date_optional_time","yyyy/MM/dd HH:mm:ss Z||yyyy/MM/dd Z"]

In [20]:
resp = es.index(
    index="datedetectionindex",
    id="1",
    document={
        "create_date": "2015/09/02"
    },
)
print(resp)

resp1 = es.indices.get_mapping(
    index="datedetectionindex",
)
print(resp1)

{'_index': 'datedetectionindex', '_id': '1', '_version': 2, 'result': 'updated', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
{'datedetectionindex': {'mappings': {'properties': {'create_date': {'type': 'date', 'format': 'yyyy/MM/dd HH:mm:ss||yyyy/MM/dd||epoch_millis'}}}}}


We can disable date detection by setting date_detection to false:

In [44]:
es.indices.delete(index='datedetectionindex2') 
resp = es.indices.create(
    index="datedetectionindex2",
    mappings={
        "date_detection": False
    },
)
print(resp)

resp1 = es.index(
    index="datedetectionindex2",
    id="1",
    document={
        "create_date": "2015/09/02"
    },
)
print(resp1)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'datedetectionindex2'}
{'_index': 'datedetectionindex2', '_id': '1', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}


In [45]:
mapping = es.indices.get_mapping(index='datedetectionindex2')
print(mapping)

{'datedetectionindex2': {'mappings': {'date_detection': False, 'properties': {'create_date': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}}}


Alternatively, the dynamic_date_formats can be customized to support your own date formats:

In [None]:
resp = es.indices.create(
    index="my-index-000001",
    mappings={
        "dynamic_date_formats": [
            "MM/dd/yyyy"
        ]
    },
)
print(resp)

resp1 = es.index(
    index="my-index-000001",
    id="1",
    document={
        "create_date": "09/25/2015"
    },
)
print(resp1)

##### NOTE:
There is a difference between configuring an array of date patterns and configuring multiple patterns in a single string separated by ||. When you configure an array of date patterns, the pattern that matches the date in the first document with an unmapped date field will determine the mapping of that field:

In [None]:
resp = es.indices.create(
    index="my-index-000002",
    mappings={
        "dynamic_date_formats": [
            "yyyy/MM",
            "MM/dd/yyyy"
        ]
    },
)

resp = es.indices.create(
    index="my-index-000003",
    mappings={
        "dynamic_date_formats": [
            "yyyy/MM||MM/dd/yyyy"
        ]
    },
)

print(resp)

resp2 = es.index(
    index="my-index-000002",
    id="1",
    document={
        "create_date": "09/25/2015"
    },
)
print(resp2)

resp3 = es.index(
    index="my-index-000003",
    id="1",
    document={
        "create_date": "2015/09"
    },
)

##### Numeric Detection
We can also enable numeric detection (which is disabled by default) by setting the field "numeric_detection" to True

#### Explicit Mapping
Explicit mapping gives you greater control by specifying field types and properties upfront.

This approach helps prevent incorrect data types and ensures consistent data handling.

In [None]:
# Define explicit mapping
explicit_mapping = {
    "mappings": {
        "properties": {
            "title": { "type": "text" },
            "author": { "type": "keyword" },
            "published": { "type": "date", "format": "yyyy-MM-dd" },
            "price": { "type": "float" },
            "tags": { "type": "keyword" }
        }
    }
}

# Create index with custom mapping
es.indices.create(index='library', body=explicit_mapping)

# Index a sample document
es.index(index='library', id=1, document={
    "title": "Mastering Elasticsearch",
    "author": "John Doe",
    "published": "2024-03-01",
    "price": 49.99,
    "tags": ["search", "elasticsearch", "big data"]
})


# Text Analysis

Elasticsearch provides powerful text analysis using analyzers, which process text for indexing and searching. It involves:

 - Character filter: Mainly used to strip off some unused characters or change some characters.
 - Tokenizer: Breaks a text into individual tokens(or words) and it does that based on certain factors(whitespace, ngram etc).
 - Token filter: It receives the tokens and then apply some filters(example changing uppercase terms to lowercase).

In [None]:


query = {
  "analyzer": "standard",
  "text": "Running quickly through the fields."
}
res = es.indices.analyze(body=query)
tokens = list(map(lambda n: n["token"], res["tokens"]))
print(tokens)





In [None]:
# Define custom analyzer
new_analyser = {
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": { 
          "char_filter": ["emoticons"],
          "tokenizer": "punctuation",
          "filter": ["lowercase", "english_stop"]
        }
      },
      "tokenizer": {
        "punctuation": { 
          "type": "pattern",
          "pattern": "[\\s.,!?]+"  
        }
      },
      "char_filter": {
        "emoticons": { 
          "type": "mapping",
          "mappings": [
            ":) => _happy_",
            ":( => _sad_"
          ]
        }
      },
      "filter": {
        "english_stop": { 
          "type": "stop",
          "stopwords": "_english_"
        }  #It removes common English stop words (e.g., "the", "and", "is", "of", etc.). After tokenisation
      }
    }
  }
}

# Delete index if it exists
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

# Create index with custom analyzer
es.indices.create(index=index_name, body=new_analyser)
print(f"Index '{index_name}' created successfully!")

query_new_analyser = {
  "analyzer": "my_custom_analyzer",
  "text": "I'm a :) person, and you?"
}

# Apply text analysis
res = es.indices.analyze(index=index_name, body=query_new_analyser)

# Extract and print tokens
tokens = [token["token"] for token in res["tokens"]]
print(tokens)
