In [19]:
import pandas as pd

In [1]:
# Verify HTTP response with a GET request

import requests

requests.get("http://192.168.195.78:9200").json()

{'name': '79e8d2a7d7e7',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'ydOQxjPIS3G3vsNk9di2Qw',
 'version': {'number': '7.6.0',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '7f634e9f44834fbc12724506cc1da681b0c3b1e3',
  'build_date': '2020-02-06T00:09:00.449973Z',
  'build_snapshot': False,
  'lucene_version': '8.4.0',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [60]:
class DatasetBuilder:
    
    def __init__(self, client, index):
        self.client = client
        self.index = index
    
    def __build_serialized_dataset(self, delta_time = 1000, start_ts = 1652979000000):
        
        index = self.index
        window_size = 100000
    
        query = {
            "range": {
                "startTimeMillis" : {
                    "gte" : 1652979531000,
                    "lte" : 1652983145000
                }
            }
        }

        resp = self.client.search(
                index = index,
                query = query,
                size = window_size,
                scroll = '30s' # length of time to keep search context
            )


        # keep track of pass scroll _id
        old_scroll_id = resp['_scroll_id']
        
        values_to_add = []
        
        clients = 0
        servers = 0

        print("old scroll id", old_scroll_id)
        
        doc_count = 0
        
        while len(resp['hits']['hits']):
            # make a request using the Scroll API
            
            doc_count += 1
            
            print ("\nresponse for index:", index)
            print ("_scroll_id:", resp['_scroll_id'])
            print ('response["hits"][ "total"]["value"]:', resp["hits"]["total"]["value"])
            print("Actual counter: ", doc_count, round(doc_count * window_size / resp["hits"]["total"]["value"] * 100), "%")
            
            
            # print("Before scroll")
            resp = self.client.scroll(
                scroll_id = old_scroll_id,
                scroll = '30s' # length of time to keep search context
            )
            
            # print("after scroll")

            # check if there's a new scroll ID
            if old_scroll_id != resp['_scroll_id']:
                print ("NEW SCROLL ID:", resp['_scroll_id'])

            # keep track of pass scroll _id
            old_scroll_id = resp['_scroll_id']
        
            

            # sts = 1652979531
            # ets = 1652983145

            for i, hit in enumerate(resp['hits']['hits']):
                data_source = hit['_source']
                if not "kind" in data_source.keys():
                    print("strage response", json.dumps(hit['_source'], indent=4))
                else:
                    kind = data_source['kind']

                    if kind == "SERVER":

                        servers += 1

                        # Consider the span only if is kind SERVER

                        spanID = data_source['spanID']
                        traceID = data_source['traceID']
                        duration = data_source['duration']
                        process = data_source['process']['serviceName']
                        operation = data_source['operationName']

                        startTimeMillis = hit['_source']['startTimeMillis']

                        frame = int((startTimeMillis - start_ts) / delta_time)


                        if spanID == traceID:
                            tp = "entry"
                        else:
                            tp = "end"

                        dict_vals = {
                            "time_frame": frame,
                            "type": tp,
                            "duration": duration,
                            "service": process + ":" + operation
                        }    

                        values_to_add.append(dict_vals)
                        # break

                    else:
                        clients += 1
                
                
                
        print("=======================")
        print("------- STATS: --------")
        print("Total spans: ", resp["hits"]["total"]["value"])
        print("SERVER SPANS", servers)
        print("CLIENT SPANS", clients)

        dataset = pd.DataFrame.from_records(values_to_add)
        return dataset
    
    def __build_final_dataset(self, serialized_dataset):
        values_to_add = []

        for i, frame in enumerate(serialized_dataset['time_frame'].unique()):
            # print(frame)
            frame_values = {
                "time_frame": frame
            }

            ds_subset = serialized_dataset[serialized_dataset["time_frame"] == frame]

            # print(ds_subset)

            services = (ds_subset["service"] + "___" + ds_subset["type"]).unique()

            # print(services)

            for service in services:
                service_name, tp = service.split("___")
                if tp == "entry":
                    subdf = ds_subset.loc[(ds_subset["service"] == service_name) & (ds_subset["type"] == tp)]
                    frame_values["ENTRYPOINT_AVG_EXTIME_" + service_name] = sum(list(subdf["duration"])) / len(subdf)
                    frame_values["ENTRYPOINT_N_INVOC_API_" + service_name] = sum(list(subdf["duration"])) / len(subdf)
                elif tp == "end":
                    subdf = ds_subset.loc[(ds_subset["service"] == service_name) & (ds_subset["type"] == tp)]
                    frame_values["ENDPOINT_N_INVOC_API_" + service_name] = len(subdf)

            # print(frame_values)

            values_to_add.append(frame_values)


        final_dataset = pd.DataFrame.from_records(values_to_add)
        
        final_dataset = final_dataset.fillna(0)
        return final_dataset
    
    def build_dataset(self, delta_time = 100, start_ts = 1652979000000):
        _serialized_ds = self.__build_serialized_dataset(delta_time = delta_time, start_ts = start_ts)
        return self.__build_final_dataset(_serialized_ds)

In [61]:
# import the Elasticsearch client library
from elasticsearch import Elasticsearch, exceptions

# import JSON and time
import json, time

# create a timestamp using the time() method
start_time = time.time()

# declare globals for the Elasticsearch client host
DOMAIN = "http://192.168.195.78"
PORT = 9200

# concatenate a string for the client's host paramater
host = str(DOMAIN) + ":" + str(PORT)

# declare an instance of the Elasticsearch library
client = Elasticsearch(host)

try:
    # use the JSON library's dump() method for indentation
    info = json.dumps(client.info(), indent=4)

    # pass client object to info() method
    print ("Elasticsearch client info():", info)

except exceptions.ConnectionError as err:

    # print ConnectionError for Elasticsearch
    print ("\nElasticsearch info() ERROR:", err)
    print ("\nThe client host:", host, "is invalid or cluster is not running")

    # change the client's value to 'None' if ConnectionError
    client = None

# valid client instance for Elasticsearch
if client != None:
    
    index = "jaeger-span-2022-05-19"
    
    client.indices.put_settings(index=index,
                        body= {"index" : {
                                "max_result_window" : 500000
                              }})

    db = DatasetBuilder(client = client, index = index)

    df = db.build_dataset()

    df

Elasticsearch client info(): {
    "name": "79e8d2a7d7e7",
    "cluster_name": "docker-cluster",
    "cluster_uuid": "ydOQxjPIS3G3vsNk9di2Qw",
    "version": {
        "number": "7.6.0",
        "build_flavor": "default",
        "build_type": "docker",
        "build_hash": "7f634e9f44834fbc12724506cc1da681b0c3b1e3",
        "build_date": "2020-02-06T00:09:00.449973Z",
        "build_snapshot": false,
        "lucene_version": "8.4.0",
        "minimum_wire_compatibility_version": "6.8.0",
        "minimum_index_compatibility_version": "6.0.0-beta1"
    },
    "tagline": "You Know, for Search"
}
old scroll id DXF1ZXJ5QW5kRmV0Y2gBAAAAAAABfB8WRDJVZ1k0cThRSXFabkEwcnJqWVBFZw==

response for index: jaeger-span-2022-05-19
_scroll_id: DXF1ZXJ5QW5kRmV0Y2gBAAAAAAABfB8WRDJVZ1k0cThRSXFabkEwcnJqWVBFZw==
response["hits"][ "total"]["value"]: 2874425
Actual counter:  1 3 %

response for index: jaeger-span-2022-05-19
_scroll_id: DXF1ZXJ5QW5kRmV0Y2gBAAAAAAABfB8WRDJVZ1k0cThRSXFabkEwcnJqWVBFZw==
respons

------- STATS: --------
Total spans:  2874425
SERVER SPANS 1408475
CLIENT SPANS 1365923


In [62]:
df

Unnamed: 0,time_frame,ENDPOINT_N_INVOC_API_ts-travel-service:queryInfo,ENDPOINT_N_INVOC_API_ts-travel-service:getRouteByTripId,ENDPOINT_N_INVOC_API_ts-travel-service:getTrainTypeByTripId,ENDPOINT_N_INVOC_API_ts-config-service:retrieve,ENTRYPOINT_AVG_EXTIME_ts-consign-service:findByOrderId,ENTRYPOINT_N_INVOC_API_ts-consign-service:findByOrderId,ENTRYPOINT_AVG_EXTIME_ts-travel-plan-service:getByCheapest,ENTRYPOINT_N_INVOC_API_ts-travel-plan-service:getByCheapest,ENDPOINT_N_INVOC_API_ts-ticketinfo-service:queryForStationId,...,ENTRYPOINT_AVG_EXTIME_ts-auth-service:getToken,ENTRYPOINT_N_INVOC_API_ts-auth-service:getToken,ENDPOINT_N_INVOC_API_ts-order-service:securityInfoCheck,ENTRYPOINT_AVG_EXTIME_ts-assurance-service:getAllAssuranceType,ENTRYPOINT_N_INVOC_API_ts-assurance-service:getAllAssuranceType,ENTRYPOINT_AVG_EXTIME_ts-travel-service:queryInfo,ENTRYPOINT_N_INVOC_API_ts-travel-service:queryInfo,ENTRYPOINT_AVG_EXTIME_ts-order-service:POST,ENTRYPOINT_N_INVOC_API_ts-order-service:POST,ENDPOINT_N_INVOC_API_ts-order-service:GET
0,5702,2.0,6.0,3.0,9.0,10746.0,10746.0,279672.0,279672.0,22.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5701,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5703,0.0,6.0,6.0,6.0,6008.5,6008.5,0.0,0.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5694,0.0,0.0,0.0,2.0,0.0,0.0,212794.0,212794.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5695,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10856,41297,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1748.0,1748.0,0.0,0.0,0.0,0.0,0.0
10857,41296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,29062.0,29062.0,0.0,0.0,0.0
10858,41305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,108552.0,108552.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10859,41306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
df.to_csv("avv_dataset.csv")