In [357]:
''' Jupyter Notebook is an incredibly powerful tool for interactively developing and presenting data science projects.  '''
''' It allows you to run code, display the results, and add explanations, formulas, and charts all in one place.  '''

''' Script that can check for duplicate entries for TASK documents, it would be great if we could do this for all WMx/OMx ES indices.'''
''' Modify your script to check for any process name by the below field(s) to see if any duplicate data exists. '''
# Banner for the notebook run (plain string: there is nothing to interpolate).
print("Check the records if index has duplicate records")

Check the records if index has duplicate recores


In [358]:
#!pip uninstall -y elasticsearch
#!pip install elasticsearch==7.13.0
''' To begin, you need to install pytest and ipytest, a tool designed to run pytest tests directly in Jupyter. Execute the following in a Jupyter cell '''
# !pip install pytest ipytest

In [359]:
# !pip freeze

In [360]:
!pip show elasticsearch
!pip install tqdm

Name: elasticsearch
Version: 7.13.0
Summary: Python client for Elasticsearch
Home-page: https://github.com/elastic/elasticsearch-py
Author: Honza Král, Nick Lang
Author-email: honza.kral@gmail.com, nick@nicklang.com
License: Apache-2.0
Location: /home/biadmin/monitoring/jupyter_notebook/.venv/lib/python3.9/site-packages
Requires: certifi, urllib3
Required-by: 


In [361]:
''' https://blog.jupyter.org/introducing-jupyter-scheduler-f9e82676c388 '''
''' jupyter notebook scheduler '''
!pip install jupyter_scheduler



In [362]:
from elasticsearch import Elasticsearch
import os
import json
import pandas as pd
# import pytest
import ipytest
import datetime
from tqdm.auto import tqdm
import time
from dotenv import load_dotenv

In [363]:
''' To begin, you need to install pytest and ipytest, a tool designed to run pytest tests directly in Jupyter. Execute the following in a Jupyter cell '''
ipytest.autoconfig()

In [364]:
''' pip install python-dotenv'''
load_dotenv() # will search for .env file in local folder and load variables 

True

In [365]:
def get_headers():
    ''' Build the HTTP headers sent with every Elasticsearch request.

    The Authorization value is read from the BASIC_AUTH environment variable on
    each call. When the variable is unset the header carries the literal text
    'None', exactly as '{}'.format(None) would produce.
    '''
    authorization = str(os.getenv('BASIC_AUTH'))
    headers = dict()
    headers['Content-type'] = 'application/json'
    headers['Authorization'] = authorization
    headers['Connection'] = 'close'
    return headers

In [366]:
def get_es_instance(host):
    ''' Create an Elasticsearch client for the given host.

    host: host (typically "name:port") without a scheme; "http://" is prefixed here.
    The client is built with the headers from get_headers(), a 60 second timeout
    and certificate verification switched off.
    '''
    endpoint = "http://{}".format(host)
    return Elasticsearch(
        hosts=endpoint,
        headers=get_headers(),
        timeout=60,
        verify_certs=False,
    )

# es_obj_s_client = get_es_instance("localhost:9200")

In [367]:
# resp = es_obj_s_client.cluster.health()
# print(json.dumps(resp, indent=2))

In [368]:
es_host_duplicates = {}

In [369]:
def json_value_to_transform_trim(raw_json):
    ''' Walk a JSON-like structure (nested lists/dicts) and return that same object.

    Every scalar dict value is written back under its own key unchanged, so the
    structure comes back with identical content; anything that is neither a list
    nor a dict is returned as-is.
    '''
    def get_recursive_nested_all(node):
        if isinstance(node, list):
            for item in node:
                get_recursive_nested_all(item)
            return node

        if isinstance(node, dict):
            for key, value in node.items():
                if isinstance(value, (list, dict)):
                    # containers are descended into
                    get_recursive_nested_all(value)
                else:
                    # scalars are re-assigned as they are
                    node[key] = value
        return node

    return get_recursive_nested_all(raw_json)

In [370]:
def script_query_to_query_list(script_query) -> list:
    ''' Extract the field names referenced by a painless "doc[...]" script.

    script_query: script source such as
        "doc['RECEIPTKEY.keyword'].value + params.param + doc['SITEID.keyword'].value"

    Returns the field names in order of appearance, e.g.
        ['RECEIPTKEY.keyword', 'SITEID.keyword']
    and an empty list when the script holds no doc[...] accessor.

    The names are taken from inside the doc['...'] accessors with a regular
    expression rather than by deleting substrings, so a field whose name contains
    "doc", "+" or ".value" is returned intact and the spacing around "+" in the
    script does not matter.
    '''
    import re  # local import: the notebook's import cell does not load `re`

    # doc['FIELD'] or doc["FIELD"]; group 2 is the text between the matching quotes.
    accessor = re.compile(r"doc\[\s*(['\"])(.*?)\1\s*\]")
    names = [match.group(2).strip() for match in accessor.finditer(script_query)]
    return [name for name in names if name]

In [371]:
def query_handler(process, script_query, basic_query=True):
    ''' Build the duplicate-detection aggregation request for one process/index.

    process: process name; returned unchanged as the first tuple element.
    script_query: painless script source that concatenates the key fields, e.g.
        "doc['ORDERKEY.keyword'].value + params.param + doc['SITEID.keyword'].value"
        (params.param is the "," separator supplied with the script).
    basic_query: True  -> counting query only;
                 False -> same script aggregation plus a "first-report" top_hits
                          sub-aggregation (10 hits per bucket).

    Returns (process, query_dict). Three shapes are produced:
      * basic_query=False: script terms aggregation + top_hits, for any process.
      * basic_query=True and process in with_aggs_query: nested field terms
        aggregations on the first two fields parsed from the script by
        script_query_to_query_list() (IndexError if it yields fewer than two).
        The original note says this form is used to avoid timeouts.
      * basic_query=True otherwise: script terms aggregation only.

    References kept from the original notes:
      https://stackoverflow.com/questions/53076349/script-writing-to-get-distinct-value-from-elasticsearch
      https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-terms-aggregation.html#search-aggregations-bucket-terms-aggregation-script
    '''
    def script_terms():
        # Terms aggregation keyed by the script result; only keys seen at least twice are returned.
        return {
            "script": {
                "source": script_query,
                "lang": "painless",
                "params": {
                    "param": ","
                }
            },
            "min_doc_count": 2,
            "size": 10000
        }

    with_aggs_query = ['wx_inv_trans', 'wx_order', 'wx_order_casemnf', 'wx_receipt', 'om_inventorytransaction', 'om_mbol', 'om_receipt']

    if not basic_query:
        detailed = {
            "size": 0,
            "aggs": {
                "duplicates": {
                    "terms": script_terms(),
                    "aggs": {
                        "first-report": {
                            "top_hits": {
                                "_source": ["_id"],
                                "size": 10
                            }
                        }
                    }
                }
            }
        }
        return process, detailed

    if process not in with_aggs_query:
        counting = {
            "size": 0,
            "aggs": {
                "duplicates": {
                    "terms": script_terms()
                }
            }
        }
        return process, counting

    # Field-based variant: needs at least two field names out of the script.
    aggs_query = script_query_to_query_list(script_query)
    outer_field, inner_field = aggs_query[0], aggs_query[1]
    nested = {
        "size": 0,
        "aggs": {
            "duplicates": {
                "terms": {
                    "field": outer_field,
                    "min_doc_count": 2
                },
                "aggs": {
                    "sub": {
                        "terms": {
                            "field": inner_field,
                            "min_doc_count": 2,
                            "size": 10000
                        }
                    },
                    # Drops outer buckets whose "sub" aggregation came back empty.
                    "min_bucket_selector": {
                        "bucket_selector": {
                            "buckets_path": {
                                "count": "sub._bucket_count"
                            },
                            "script": {
                                "source": "params.count > 0"
                            }
                        }
                    }
                }
            }
        }
    }
    return process, nested

In [372]:
def check_duplicates_tasks(env, process_script_query_dict, process):
    ''' Run the duplicate check for one process/index in one environment and record the result.

    env: environment name; upper-cased and used to build the name of the
        "<ENV>_ES_HOST" environment variable that holds the Elasticsearch host.
    process_script_query_dict: despite its name, the painless script source (a
        string) for this process; it is handed to query_handler() as-is.
    process: process name; query_handler() returns it as the index to search.

    Returns None. The outcome is recorded in module-level containers:
      * exactly ONE entry is appended to each of dataframe_column,
        dataframe_process, dataframe_db, dataframe_es_client,
        dataframe_delay_time, dataframe_value and dataframe_ids, on success and
        on failure alike, so the lists always stay the same length and can be
        turned into a DataFrame;
      * on success raw_env_dict[ENV] is increased by the number of duplicate
        buckets, and when that number is > 0 the detailed lookup query is stored
        in query_env_dict[ENV][process].

    Any exception is printed together with the process name and recorded as
    delay -1.0, count -1 and an empty lookup query; nothing is raised.
    '''
    env_name = str(env).upper()
    es_host = "{}_ES_HOST".format(env_name)

    # Failure defaults; replaced only when the whole check succeeds.
    db_label = ''
    es_obj_s_client = None
    delay_time = float(-1)
    duplicates_count = -1
    lookup_query = ""

    try:
        if 'wx' in process:
            db_label = 'WMx DB'
        elif 'om' in process:
            db_label = 'OMx DB'

        es_obj_s_client = get_es_instance(os.getenv(es_host))

        # Counting query first; the detailed one is only built when duplicates exist.
        index_name, query = query_handler(process, process_script_query_dict)

        start_time = datetime.datetime.now()
        response = es_obj_s_client.search(index=index_name, body=query)
        elapsed = datetime.datetime.now() - start_time

        duplicates_list = response['aggregations']['duplicates']['buckets']
        found = len(duplicates_list)

        detailed_query = ""
        if found > 0:
            _, detailed_query = query_handler(process, process_script_query_dict, basic_query=False)
    except Exception as e:
        print(process, e)
    else:
        # Elapsed search time as "<seconds>.<hundredths>" (microseconds truncated to two digits).
        delay_time = float(str(elapsed.seconds) + '.' + str(elapsed.microseconds).zfill(6)[:2])
        duplicates_count = found
        lookup_query = detailed_query
        if found > 0:
            query_env_dict.setdefault(env_name, {}).update({process: detailed_query})
        raw_env_dict[env_name] = int(raw_env_dict.get(env_name, 0)) + found

    # Single commit point: one row across all parallel lists.
    dataframe_column.append(env_name)
    dataframe_process.append(process)
    dataframe_db.append(db_label)
    dataframe_es_client.append(es_obj_s_client)
    dataframe_delay_time.append(delay_time)
    dataframe_value.append(duplicates_count)
    dataframe_ids.append(lookup_query)


In [373]:

from collections import defaultdict

# Environments to scan. check_duplicates_tasks() upper-cases each entry to build
# the "<ENV>_ES_HOST" environment variable name. Alternative presets are kept below.
env_list = ['dev']
# env_list = ['dev', 'dev']
# env_list = ['qa1','qa2']
# env_list = ['qa4','qa5','qa6']
# env_list = ['qa1','qa2','qa4','qa5','qa6','qa9','qa11','qa13','qa14','qa15','qa16','qa17','qa18','qa20','qa25']
# env_list = ['prod1','prod2','prod3','prod4','prod6','prod7','prod8','prod9','prod10','prod12','prod13','prod14','prod16','prod17','prod18','prod19','prod20']
# env_list = ['prod1']
# env_list = ['prod1','prod2']
# env_list = ['prod1','prod2','prod3','prod4','prod6','prod7']
# env_list = ['prod8','prod9','prod10','prod12','prod13','prod14']
# env_list = ['prod16','prod17','prod18','prod19','prod20']

# Parallel result columns: check_duplicates_tasks() appends to these per (env, process).
dataframe_column = []
dataframe_process = []
dataframe_es_client = []
dataframe_value = []
dataframe_db = []
dataframe_query_dsl = []
dataframe_ids = []
dataframe_delay_time = []
dataframe_df_result = []

# Column name -> column list, filled in once the scan has finished.
dataframe_dict = {}

# ENV -> total number of duplicate buckets found in that environment.
raw_env_dict = {}
# raw_env_dict = defaultdict(dict)

# ENV -> {process: detailed lookup query} for processes that reported duplicates.
query_env_dict = defaultdict(dict)

''' Script that can check for duplicate entries for TASK documents, it would be great if we could do this for all WMx/OMx ES indices.'''
''' Modify your script to check for any process name by the below field(s) to see if any duplicate data exists. '''

''' Create query dsl with unique fields '''
''' take long time to extract the duplicate entries '''
# 'wx_inv_trans' : "doc['INVTRANSKEY.keyword'].value + params.param + doc['SITEID.keyword'].value",

# Process name -> painless script source that builds the uniqueness key for that
# process. Multi-field keys are concatenated with params.param, which
# query_handler() supplies as ",". The driver loop below passes each key as the
# process name and each value as the script to check_duplicates_tasks().
# Entries that are commented out are not scanned, although query_handler() still
# lists 'wx_inv_trans' and 'om_inventorytransaction' in its with_aggs_query.
process_script_query_dict = {
    'wx_adjustment' : "doc['ADJUSTMENTKEY.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_asn' : "doc['ASNKEY.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_asndtl' : "doc['ASNKEY.keyword'].value + params.param + doc['ASNLINENO'].value + params.param + doc['SITEID.keyword'].value",
    'wx_company' : "doc['COMPANYKEY.keyword'].value + params.param + doc['COMPANYTYPE.keyword'].value + params.param + doc['CLIENTID.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_container_ship' : "doc['CONTAINERKEY.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_dock_ack' : "doc['DOCKACKKEY.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_inv_case' : "doc['INVCASEID.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_inv_hold' : "doc['INVHOLDID'].value",
    'wx_inv_holdtrans' : "doc['INVHOLDTRANSID'].value",
    'wx_inv_hold_hist' : "doc['INVHOLDID'].value",
    'wx_inv_sn' : "doc['INVSN.keyword'].value + params.param + doc['SKU.keyword'].value + params.param + doc['CLIENTID.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_inv_sn_trans' : "doc['INVSNTRANSID'].value",
    # 'wx_inv_trans' : "doc['INVTRANSKEY.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_loc' : "doc['LOC.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_mbol' : "doc['MBOLKEY.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_nci' : "doc['NCIKEY.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_order' : "doc['ORDERKEY.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_orderdtl' : "doc['ORDERKEY.keyword'].value + params.param + doc['ORDERLINENO'].value + params.param + doc['SITEID.keyword'].value",
    'wx_order_casemnf' : "doc['CASEID.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_po' : "doc['POKEY.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_podtl' : "doc['POKEY.keyword'].value + params.param + doc['POLINENO'].value + params.param + doc['SITEID.keyword'].value",
    'wx_receipt' : "doc['RECEIPTKEY.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_sku' : "doc['SKU.keyword'].value + params.param + doc['CLIENTID.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'wx_task' : "doc['TASKID'].value",
    'wx_wave' : "doc['WAVEKEY.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'om_appointment' : "doc['CRN'].value",
    'om_asn' : "doc['CRN'].value",
    'om_controlorder' : "doc['CRN'].value",
    # 'om_inventorytransaction' : "doc['SITE_ID.keyword'].value + params.param + doc['INVTRANSKEY.keyword'].value",
    # NOTE(review): the next entry uses CLIENT_ID (with underscore) while the wx_* entries use CLIENTID — confirm against the om_inv_balance mapping.
    'om_inv_balance' : "doc['CLIENT_ID.keyword'].value + params.param + doc['SITEID.keyword'].value + params.param + doc['SKU.keyword'].value + params.param + doc['ACCOUNT.keyword'].value",
    'om_mbol' : "doc['MBOLKEY.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'om_nci' : "doc['CRN'].value",
    'om_order' : "doc['CRN'].value",
    # 'om_order_original_dkr' : "doc['CRN'].value",
    'om_organization' : "doc['ORGANIZATION_ID.keyword'].value",
    # 'om_org_client' : "doc['ORGANIZATION_ID.keyword'].value",
    'om_po' : "doc['CRN'].value",
    'om_receipt' : "doc['RECEIPTKEY.keyword'].value + params.param + doc['SITEID.keyword'].value",
    'om_sku' : "doc['CRN'].value",
    'om_whorder' : "doc['CRN'].value",
}

# print('Progressing..')
# Outer progress bar over the environments, inner bar over the processes of each one.
pbar = tqdm(env_list)
last_env_position = len(env_list) - 1
for env_position, env in enumerate(pbar):
    pbar.set_description(f"Processing {env}")
    # leave=False removes a finished inner bar; only the last environment's bar stays on screen.
    pbar_process = tqdm(list(process_script_query_dict.keys()), leave=(env_position == last_env_position))
    for each_index in pbar_process:
        pbar_process.set_description(f"Processing {env} : {each_index}")
        check_duplicates_tasks(env, process_script_query_dict.get(each_index), each_index)
        # One-second pause after every process check.
        time.sleep(1)
# print('\n Duplicate records')

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

In [374]:
# Map report column names to the lists filled during the scan.
# Insertion order here is the column order of the resulting DataFrame.
dataframe_dict.update({
    "ENV" : dataframe_column,
    "ES URL" : dataframe_es_client,
    "DB" : dataframe_db,
    "Query_Time" : dataframe_delay_time,
    "Process_Name" : dataframe_process,
    "Duplicates_Count" : dataframe_value,
    "Query DSL for lookup Ids" : dataframe_ids,
})
# dataframe_dict.update({'Query DSL' : dataframe_query_dsl})

In [375]:
print(type(dataframe_dict))
# print(dataframe_dict)
# print(json.dumps(dataframe_dict, indent=2))

<class 'dict'>


In [376]:
df = pd.DataFrame(dataframe_dict)
# display(df)

In [377]:
''' print out raw data '''
# print(json.dumps(raw_env_dict, indent=2))

In [378]:
# Write the result table to duplicate.csv (a CSV file, not an Excel workbook).
# The path is relative, so the file lands in the kernel's working directory.
df.to_csv("duplicate.csv")
print("Export csv files successfully..")

Export csv files successfully..


In [379]:
''' filter if any duplicate data exist in df dataframe '''
# display(df.filter(items=['ES_Cluster', 'Process_Name'], axis=1))

In [380]:
''' env list if any duplicate data exists '''
# print(raw_env_dict)
print('# Check for any process name by the below field(s) to see if any duplicate data exists')
# Per-environment totals: one row per key of raw_env_dict.
df_dict_result = {
    "ENV" : list(raw_env_dict.keys()),
    "Duplicate_Count" : list(raw_env_dict.values()),
}
df_result = pd.DataFrame(df_dict_result)
display(df_result)

# Check for any process name by the below field(s) to see if any duplicate data exists


Unnamed: 0,ENV,Duplicate_Count
0,DEV,837


In [381]:
''' check if any duplicate data exist in df dataframe '''
# Keep the rows that reported duplicates OR whose search took longer than one second
# (Query_Time is in seconds). df is rebound to the filtered frame here; the CSV export
# in the earlier cell was written from the unfiltered frame.
df = df[(df['Duplicates_Count'] > 0) | (df['Query_Time'] > 1)]
# display(df)

In [382]:
''' print out query_dsl if any duplicate data exist on process name'''
print(json.dumps(query_env_dict, indent=2))

{
  "DEV": {
    "wx_task": {
      "size": 0,
      "aggs": {
        "duplicates": {
          "terms": {
            "script": {
              "source": "doc['TASKID'].value",
              "lang": "painless",
              "params": {
                "param": ","
              }
            },
            "min_doc_count": 2,
            "size": 10000
          },
          "aggs": {
            "first-report": {
              "top_hits": {
                "_source": [
                  "_id"
                ],
                "size": 10
              }
            }
          }
        }
      }
    }
  }
}


In [383]:
from Mail import send_mail, mail_check

In [384]:
''' mail report '''
# Build the mail body from the per-environment summary (column name -> list of values).
data = df_dict_result

# Header line: the column names, tab separated.
message_header = list(data.keys())

# One "<ENV>\t<count>" line per environment.
message_body = [
    str(env_name) + "\t" + str(duplicate_count)
    for env_name, duplicate_count in zip(data.get('ENV'), data.get('Duplicate_Count'))
]

message = "\t".join(message_header) + "<BR/>"
message += "<BR/>".join(message_body) + "<BR/>"

# Append the detailed lookup queries only when at least one process reported duplicates.
if query_env_dict:
    lookup_dsl = json.dumps(query_env_dict, indent=1).replace("\n","<br/>")
    message += "".join([
        "<BR/>",
        "<b> - Lookup Query for the duplicates process</b> (Below query is a detailed query for process with duplicates. Use below query to remove duplicates)<BR/>",
        "{}".format(lookup_dsl),
        "<BR/>",
    ])

# Recipients come from the environment; a missing variable raises KeyError.
user_mail_list = os.environ['MAIL_USERLIST']
cc_mail_list = os.environ['MAIL_CC']

In [385]:
''' send mail for report '''
send_mail(body=message, to=user_mail_list, cc=cc_mail_list)

send_mail..


In [386]:
''' ******* Test code ********* '''
def test_pytest_func():
    ''' Trivially true assertion; presumably a smoke test that the test runner picks up tests from this notebook. '''
    assert 42 == 42

In [387]:
def test_query_validate():
    ''' query_handler() builds the script-based aggregation for a process that is not in its field-aggregation list.

    Checks both shapes: the counting query (basic_query=True) and the lookup
    query with the "first-report" top_hits sub-aggregation (basic_query=False).
    '''
    script = "doc['TASKID'].value"

    def expected_terms():
        # Terms aggregation both query shapes share.
        return {
            "script": {
                "source": script,
                "lang": "painless",
                "params": {
                    "param": ","
                }
            },
            "min_doc_count": 2,
            "size": 10000
        }

    # Counting query
    index_name, query = query_handler('test', script)
    assert index_name == 'test'
    assert type(query) == dict
    expected_basic = {
        "size": 0,
        "aggs": {
            "duplicates": {
                "terms": expected_terms()
            }
        }
    }
    assert query == expected_basic

    # Lookup query used to extract the ids of the duplicates
    index_name, query = query_handler('test', script, basic_query=False)
    expected_lookup = {
        "size": 0,
        "aggs": {
            "duplicates": {
                "terms": expected_terms(),
                "aggs": {
                    "first-report": {
                        "top_hits": {
                            "_source": ["_id"],
                            "size": 10
                        }
                    }
                }
            }
        }
    }
    assert query == expected_lookup
    
    

In [388]:
def test_script_query_to_query_list():
    ''' script_query_to_query_list() returns the field names of a painless script in order.

    Covers a two-field key, a four-field key and a single-field key. The
    single-field case replaces a third case that was a verbatim repeat of the
    first one and therefore exercised nothing new.
    '''
    cases = [
        # Case #1: two fields
        (
            "doc['RECEIPTKEY.keyword'].value + params.param + doc['SITEID.keyword'].value",
            ['RECEIPTKEY.keyword', 'SITEID.keyword'],
        ),
        # Case #2: four fields
        (
            "doc['COMPANYKEY.keyword'].value + params.param + doc['COMPANYTYPE.keyword'].value + params.param + doc['CLIENTID.keyword'].value + params.param + doc['SITEID.keyword'].value",
            ['COMPANYKEY.keyword', 'COMPANYTYPE.keyword', 'CLIENTID.keyword', 'SITEID.keyword'],
        ),
        # Case #3: single field, no separator
        (
            "doc['TASKID'].value",
            ['TASKID'],
        ),
    ]

    for script, expected_fields in cases:
        assert script_query_to_query_list(script) == expected_fields, script


In [389]:
''' Execute the tests using ipytest.run(). You can pass command-line arguments to control test behavior: '''
''' The pytest framework makes it easy to write small, readable tests, and can scale to support complex functional testing for applications and libraries. '''
ipytest.run('-vv')

platform linux -- Python 3.9.0, pytest-8.3.4, pluggy-1.5.0 -- /home/biadmin/monitoring/jupyter_notebook/.venv/bin/python3.9
cachedir: .pytest_cache
rootdir: /home/biadmin/monitoring/jupyter_notebook
plugins: anyio-4.5.0
[1mcollecting ... [0mcollected 3 items

t_c370b69af1ef458185f86687e7a5ee4a.py::test_pytest_func [32mPASSED[0m[32m                               [ 33%][0m
t_c370b69af1ef458185f86687e7a5ee4a.py::test_query_validate [32mPASSED[0m[32m                            [ 66%][0m
t_c370b69af1ef458185f86687e7a5ee4a.py::test_script_query_to_query_list [32mPASSED[0m[32m                [100%][0m



<ExitCode.OK: 0>

## 