In [1]:
from pprint import pprint
from elasticsearch import Elasticsearch
from dotenv import load_dotenv
import os

# Location of the environment file that holds the LOCALHOST setting.
# NOTE(review): this is an absolute, machine-specific path, so the file is only
# found on a machine with this exact directory layout.
env_path = os.path.join('e:\\', 'Study Space', 'Python Workspace', 'ELastic Search', '.env.local')

# Load environment variables from that file into the process environment.
load_dotenv(dotenv_path=env_path)

# Print debugging information
print("Current working directory:", os.getcwd())
print("Environment file path:", env_path)

# Get the LOCALHOST variable exactly as the environment provides it.
LOCALHOST = os.getenv('LOCALHOST')
print("Raw LOCALHOST value:", repr(LOCALHOST))

# Normalise BEFORE deciding whether a usable value exists: trim surrounding
# whitespace and either kind of quote character. Doing it in this order means a
# value that is empty once trimmed (for example a literal "") also falls back
# to the default instead of being handed to the client as an empty URL.
LOCALHOST = (LOCALHOST or "").strip(" \t\r\n\"'")
if not LOCALHOST:
    print("LOCALHOST not found in environment file.")
    # Fallback to default if not found
    LOCALHOST = "http://localhost:9200/"
else:
    print("LOCALHOST found in environment file.")

print("Processed LOCALHOST value:", repr(LOCALHOST))

# Connect to Elasticsearch and print the cluster information as a smoke test.
# NOTE(review): the error is only printed, not re-raised, so later cells still
# run after a failed connection (and `es` may be undefined if the constructor
# itself raised).
try:
    es = Elasticsearch([LOCALHOST])
    client_info = es.info()
    print('Connected to Elasticsearch!')
    pprint(client_info.body)
except Exception as e:
    print(f"Connection error: {e}")

Current working directory: e:\Study Space\Python Workspace\ELastic Search
Environment file path: e:\Study Space\Python Workspace\ELastic Search\.env.local
Raw LOCALHOST value: 'http://localhost:9200/'
LOCALHOST found in environment file.
Processed LOCALHOST value: 'http://localhost:9200/'
Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'AKPh90H1StWquQfBPE4Chw',
 'name': 'b66a5ae1a4a1',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2024-08-05T10:05:34.233336849Z',
             'build_flavor': 'default',
             'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '9.11.1',
             'minimum_index_compatibility_version': '7.0.0',
             'minimum_wire_compatibility_version': '7.17.0',
             'number': '8.15.0'}}


In [4]:
# Delete the index named 'test_index'. Note that the cells below work with
# 'blog_index', not this one.
# NOTE(review): ignore_unavailable=True presumably keeps the call from failing
# when the index does not exist — confirm against the client documentation.
es.indices.delete(index='test_index', ignore_unavailable=True)

ObjectApiResponse({'acknowledged': True})

In [5]:
# A single sample blog post used to try out indexing.
blog_post = dict(
    title="Elasticsearch Basics",
    author="John Doe",
    content="Elasticsearch is a search engine based on Lucene.",
    published_date="2024-12-16",
)

# Send it to 'blog_index' and show the body of the client's response.
response = es.index(document=blog_post, index='blog_index')
pprint(response.body)


{'_id': 'ZFr20ZMBiPFnoaN1y4Gn',
 '_index': 'blog_index',
 '_primary_term': 1,
 '_seq_no': 0,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_version': 1,
 'result': 'created'}


In [8]:
# Pretty-print the response object itself (not its .body attribute, which an
# earlier cell printed).
pprint(response)

ObjectApiResponse({'_index': 'blog_index', '_id': 'ZFr20ZMBiPFnoaN1y4Gn', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})


In [10]:
import json

# Read the sample blog posts from disk. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open, and the
# explicit encoding keeps decoding independent of the platform's default
# locale encoding.
with open('data/blogs.json', encoding='utf-8') as blogs_file:
    json_data = json.load(blogs_file)

# Show each loaded post.
for blog_post in json_data:
    print(blog_post)

{'title': 'Elasticsearch Basics', 'author': 'John Doe', 'content': 'Elasticsearch is a search engine based on Lucene.', 'published_date': '2024-12-16'}
{'title': 'Elastic Stack', 'author': 'Jane Doe', 'content': 'Elastic Stack is a collection of products that helps you to store, search, analyze, and visualize data.', 'published_date': '2024-12-17'}
{'title': 'Kibana', 'author': 'John Doe', 'content': 'Kibana is a data visualization tool that is part of the Elastic Stack.', 'published_date': '2024-12-18'}


In [12]:
def insert_documents(es, index_name = None, data = None):
    """Index one document and hand back the client's response.

    Args:
        es: Client object exposing an ``index(index=..., document=...)`` method.
        index_name: Forwarded to the client as the ``index`` keyword argument.
        data: Forwarded to the client as the ``document`` keyword argument.

    Returns:
        Whatever ``es.index`` returns, unchanged.
    """
    return es.index(document=data, index=index_name)

# Start from a clean slate: drop any existing 'blog_index', then recreate it
# with number_of_shards=2 and number_of_replicas=2.
es.indices.delete(index='blog_index', ignore_unavailable=True)
es.indices.create(
    index='blog_index',
    settings={'number_of_shards': 2, 'number_of_replicas': 2},
)

# Index every loaded post and report the id, the result and the value of
# response['_shards']['total'] for each one.
for blog_post in json_data:
    response = insert_documents(es, index_name='blog_index', data=blog_post)
    print(
        "Document ID: {} is '{}' and is split into {} shards.".format(
            response['_id'],
            response["result"],
            response['_shards']['total'],
        )
    )

Document ID: ZloD0pMBiPFnoaN1w4FE is 'created' and is split into 3 shards.
Document ID: Z1oD0pMBiPFnoaN1w4GU is 'created' and is split into 3 shards.
Document ID: aFoD0pMBiPFnoaN1w4Gg is 'created' and is split into 3 shards.


## **Why does the response show 3 shards when the index was created with 2 shards and 2 replicas?**

You see 3 shards in the result, even though you set `number_of_shards: 2`, because of Elasticsearch's replication mechanism.
### **Understanding Shards and Replicas:**

* `Primary Shards`: These are the main shards where the data is stored.
* `Replica Shards`: These are copies of the primary shards that provide redundancy and high availability.

When you set:

```
settings={'number_of_shards': 2, 'number_of_replicas': 2}
```

This means:

* number_of_shards = 2 → You will have 2 primary shards.
* number_of_replicas = 2 → Each primary shard will have 2 replicas.

Thus, the total shards per index are calculated as:
    **Total Shards=Primary Shards+(Primary Shards×Replicas)**


Substitute the values:
    **Total Shards=2+(2×2)=6**

### **Why Does It Show 3 Shards per Document?**

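The number in the message is `_shards.total` from the index response: the count of shard copies (one primary plus its replicas) that this single write is directed at, not the total number of shards in the index. The document is not split into pieces; each copy holds the whole document. When you index a document: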

1. The document is written to one primary shard.
2. Elasticsearch ensures the data is also copied to the replicas to meet the replication requirement.

The default behavior involves the following shards:

1. Primary Shard for indexing the document.
2. Replica Shards for redundancy.

Thus, 3 shards (1 primary + 2 replicas) are involved for each document write operation.
### **Final Note:**

The total shards for the index are 6, but the result shows 3 shards for each document because:

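* The document was written to 1 primary shard.
* The write was also directed at that primary's 2 replica shards. `_shards.total` counts these targeted copies; `_shards.successful` reports how many actually accepted the write. Replicas that are not allocated (for example, on a single-node cluster) do not count as successful, which is why the earlier response showed `total: 2` but `successful: 1`.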

If you want fewer shards in total, you can reduce `number_of_replicas`. For example:

`settings={'number_of_shards': 2, 'number_of_replicas': 1}`

This will result in:
    **Total Shards=2+(2×1)=4**

In [15]:
# Fetch the mapping of 'blog_index' and print only its per-field "properties"
# section. The notebook passed only `settings` when creating the index, so it
# never declared these field types itself.
# NOTE(review): they were presumably inferred by Elasticsearch from the indexed
# documents (dynamic mapping) — confirm.
index_mapping = es.indices.get_mapping(index='blog_index')
pprint(index_mapping["blog_index"]["mappings"]["properties"])

{'author': {'fields': {'keyword': {'ignore_above': 256, 'type': 'keyword'}},
            'type': 'text'},
 'content': {'fields': {'keyword': {'ignore_above': 256, 'type': 'keyword'}},
             'type': 'text'},
 'published_date': {'type': 'date'},
 'title': {'fields': {'keyword': {'ignore_above': 256, 'type': 'keyword'}},
           'type': 'text'}}
