In [1]:
print(f"ELK Upgrade Index Validation Logic -> Validate index count for Source/Dest ES cluster")

ELK Upgrade Index Validation Logic -> Validate index count for Source/Dest ES cluster


### Validate for the documents between two clusters via Reindex script
* Compare indices between two clusters

In [2]:
from elasticsearch import Elasticsearch
import os
import json
import pandas as pd
from dotenv import load_dotenv
import warnings
warnings.filterwarnings("ignore")

In [3]:
''' pip install python-dotenv'''
load_dotenv() # will search for .env file in local folder and load variables 

True

In [4]:
def get_headers():
    ''' Elasticsearch Header '''
    return {
            'Content-type': 'application/json', 
            'Authorization' : '{}'.format(os.getenv('BASIC_AUTH')),
            # 'Connection': 'close'
    }

In [5]:
def get_es_instance(host):
    es_client = Elasticsearch(hosts="{}".format(host), headers=get_headers(), timeout=5,  verify_certs=False)
    return es_client

''' Source cluster '''
es_obj_s_client = get_es_instance("http://localhost:9201")
''' Target cluster '''
es_obj_t_client = get_es_instance("http://localhost:9202")

In [6]:
def try_exists_index(es_client, index):
    try:
        if es_client.indices.exists(index):
            return True
        return False
            
    except Exception as e:
        # logging.error(e)
        print(e)
        pass

In [7]:
resp = es_obj_s_client.cluster.health()
print(json.dumps(resp, indent=2))

{
  "cluster_name": "docker-cluster",
  "status": "yellow",
  "timed_out": false,
  "number_of_nodes": 1,
  "number_of_data_nodes": 1,
  "active_primary_shards": 186,
  "active_shards": 186,
  "relocating_shards": 0,
  "initializing_shards": 0,
  "unassigned_shards": 186,
  "delayed_unassigned_shards": 0,
  "number_of_pending_tasks": 0,
  "number_of_in_flight_fetch": 0,
  "task_max_waiting_in_queue_millis": 0,
  "active_shards_percent_as_number": 50.0
}


In [8]:
df = pd.DataFrame.from_dict([resp])
df.head(10)

Unnamed: 0,cluster_name,status,timed_out,number_of_nodes,number_of_data_nodes,active_primary_shards,active_shards,relocating_shards,initializing_shards,unassigned_shards,delayed_unassigned_shards,number_of_pending_tasks,number_of_in_flight_fetch,task_max_waiting_in_queue_millis,active_shards_percent_as_number
0,docker-cluster,yellow,False,1,1,186,186,0,0,186,0,0,0,0,50.0


In [9]:
resp = es_obj_t_client.cluster.health()
print(json.dumps(resp, indent=2))

{
  "cluster_name": "docker-elasticsearch",
  "status": "yellow",
  "timed_out": false,
  "number_of_nodes": 1,
  "number_of_data_nodes": 1,
  "active_primary_shards": 84,
  "active_shards": 84,
  "relocating_shards": 0,
  "initializing_shards": 0,
  "unassigned_shards": 57,
  "delayed_unassigned_shards": 0,
  "number_of_pending_tasks": 0,
  "number_of_in_flight_fetch": 0,
  "task_max_waiting_in_queue_millis": 0,
  "active_shards_percent_as_number": 59.57446808510638
}


In [10]:
df = pd.DataFrame.from_dict([resp])
df.head(10)

Unnamed: 0,cluster_name,status,timed_out,number_of_nodes,number_of_data_nodes,active_primary_shards,active_shards,relocating_shards,initializing_shards,unassigned_shards,delayed_unassigned_shards,number_of_pending_tasks,number_of_in_flight_fetch,task_max_waiting_in_queue_millis,active_shards_percent_as_number
0,docker-elasticsearch,yellow,False,1,1,84,84,0,0,57,0,0,0,0,59.574468


In [11]:
''' Verify if all records was uploaded to the target es cluster via the reindxing script '''
elk_reindex_investigation = True
'''' Verify and compare the number of docs between two es clusters regarding to the custom mappings'''
# elk_reindex_investigation = True

"' Verify and compare the number of docs between two es clusters regarding to the custom mappings"

In [12]:
if elk_reindex_investigation:
    query = {
            # "_source": False,
    	'query': {
           'match_all': {}
         }
    }
else:
    query ={
       "query":{
          "bool":{
             "must":[
                {
                   "range":{
                      "ADDTS":{
                         "gte": "%s" % '01/01/2022',
                         "lte": "%s" % '01/01/2024',
                         "format":"MM/dd/yyyy"
                      }
                   }
                },
                {
                   "range":{
                      "EDITTS":{
                         "gte": "%s" % '01/01/2022',
                         "lte": "%s" % '01/01/2024',
                         "format":"MM/dd/yyyy"
                      }
                   }
                }
             ]
          }
       }
}

In [13]:
''' extact a list of indices from the source cluster'''
source_idx_lists = es_obj_s_client.indices.get("*")
# logging.info(json.dumps(source_idx_lists, indent=2))
# logging.info(source_idx_lists)

In [14]:
def compare_docs_source_dest(es_client, es_t_client, source_idx_lists):
    is_not_exist_lists, different_doc, all_doc, all_docs_df = [], [], [], {}
    source_cluter, target_cluter, index_column, index_value, source_cnt, target_cnt = [], [], [], [], [], []
    for each_index in source_idx_lists:
        ''' exclude system indices in the source cluster such as .monitoring-es-7-2024.07.12'''
        if '.' not in each_index:
            res_count_source, res_count_target = 0, 0
            ''' compare each index between source cluster and target cluster'''
            is_exist = try_exists_index(es_t_client, each_index)
            # logging.info(f"validate index [{each_index}] exsits : results is {is_exist}")
            ''' check the number of count'''
            res_count_source = es_client.count(index=each_index, body=query)["count"]
            if is_exist:
                res_count_target = es_t_client.count(index=each_index, body=query)["count"]

            index_column.append(each_index)
            
            if res_count_source > res_count_target:
                differ_dict = {
                                each_index : {
                                            "source_docs" : "%s" % res_count_source,
                                            "target_docs" : "%s" % res_count_target,
                                            "count" : "Differ"
                                            }
                               }
                
                different_doc.append(differ_dict)
                all_doc.append(differ_dict)
                index_value.append(False)
            else:
                 differ_dict = {
                                each_index : {
                                            "source_docs" : "%s" % res_count_source,
                                            "target_docs" : "%s" % res_count_target,
                                            "count" : "Same"
                                            }
                               }
                 all_doc.append(differ_dict)
                 index_value.append(True)

            ''' es cluster '''
            source_cluter.append(es_client)
            target_cluter.append(es_t_client)
            ''' index cnt '''
            source_cnt.append(res_count_source)
            target_cnt.append(res_count_target)
                

            # print(res)
            if not is_exist:
                is_not_exist_lists.append(each_index)

    all_docs_df.update({"Index_Name" : index_column})
    all_docs_df.update({"source_cluster" : str(source_cluter)})
    all_docs_df.update({"Source Count" : source_cnt})
    all_docs_df.update({"target_cluster" : str(target_cluter)})
    all_docs_df.update({"Target Count" : target_cnt})
    all_docs_df.update({"Reindex Completed" : index_value})

    return all_doc, all_docs_df, different_doc, is_not_exist_lists

In [15]:
all_doc, all_docs_df, different_doc, is_not_exist_lists = compare_docs_source_dest(es_obj_s_client, es_obj_t_client, source_idx_lists)
# logging.info(different_doc)
print("all_doc : {}".format(json.dumps(all_doc, indent=2)))

all_doc : [
  {
    "struts2-blank": {
      "source_docs": "0",
      "target_docs": "0",
      "count": "Same"
    }
  },
  {
    "portal": {
      "source_docs": "0",
      "target_docs": "0",
      "count": "Same"
    }
  },
  {
    "struts2-rest-showcase": {
      "source_docs": "0",
      "target_docs": "0",
      "count": "Same"
    }
  },
  {
    "archive_es_del_queue_active_test": {
      "source_docs": "0",
      "target_docs": "0",
      "count": "Same"
    }
  },
  {
    "jbossmq-httpil": {
      "source_docs": "0",
      "target_docs": "0",
      "count": "Same"
    }
  },
  {
    "pages": {
      "source_docs": "0",
      "target_docs": "0",
      "count": "Same"
    }
  },
  {
    "action": {
      "source_docs": "0",
      "target_docs": "0",
      "count": "Same"
    }
  },
  {
    "es_pipeline_upload_test03152018_01": {
      "source_docs": "0",
      "target_docs": "0",
      "count": "Same"
    }
  },
  {
    "test-index1": {
      "source_docs": "0",
      "target_

## Json to Dataframe
#### - Sample for converting json to dataframe

In [16]:
''' sample '''
''' https://docs.kanaries.net/ko/topics/Pandas/pandas-add-column '''
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40]
}
 
df = pd.DataFrame(data)
df.head(10)

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,35
3,David,40


In [17]:
# print("all_docs_df : {}".format(json.dumps(all_docs_df, indent=2)))

In [18]:
from rich.markdown import Markdown
MARKDOWN = f"""
Reindex Report 
between source cluster : {es_obj_s_client} and target Cluster : {es_obj_t_client}
"""
Markdown(MARKDOWN)

* Compare the index between two cluster for reindexing

In [19]:
''' Compare the ES indices between source cluster and target cluster '''
df = pd.DataFrame.from_dict(all_docs_df)
df.head(100)

Unnamed: 0,Index_Name,source_cluster,Source Count,target_cluster,Target Count,Reindex Completed
0,struts2-blank,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,True
1,portal,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,True
2,struts2-rest-showcase,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,True
3,archive_es_del_queue_active_test,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,True
4,jbossmq-httpil,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,True
5,pages,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,True
6,action,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,True
7,es_pipeline_upload_test03152018_01,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,True
8,test-index1,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,True
9,my-index-01,"[<Elasticsearch([{'host': 'localhost', 'port':...",1,"[<Elasticsearch([{'host': 'localhost', 'port':...",0,False


In [20]:
# print("different_doc : {}".format(json.dumps(different_doc, indent=2)))

In [21]:
# print("is_not_exist_lists : {}".format(json.dumps(is_not_exist_lists, indent=2)))