In [2]:
import functools
import os
from datetime import datetime

import pandas as pd
from dotenv import load_dotenv
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

# Load settings from a local .env file (python-dotenv) before any os.getenv()
# lookup below; the ELK_HOST value read in the next cell is expected to come from it.
load_dotenv()

True

In [3]:
# Address of the Elasticsearch cluster, taken from the environment / .env file.
ELK_HOST = os.getenv("ELK_HOST")
if not ELK_HOST:
    # Fail here with a clear message instead of handing ``None`` to the client,
    # which would only surface later as an obscure connection/type error.
    raise RuntimeError(
        "ELK_HOST is not set; define it in the environment or in the .env file"
    )

# Shared client used by every query in this notebook. Generous timeout and
# retry settings because the scroll queries below can be slow.
# NOTE(security): verify_certs=False (with the warning silenced) disables TLS
# certificate validation. Acceptable only against a trusted test cluster;
# enable verification before pointing this at anything else.
es = Elasticsearch(
    hosts=[ELK_HOST],
    timeout=300,
    max_retries=10,
    retry_on_timeout=True,
    verify_certs=False,
    ssl_show_warn=False,
)

In [4]:
# Names of the Kurento load-test runs to export; each one becomes the prefix
# of the CSV files written further down.
index_kurento_names = ["loadtest-webrtc-2024-kurento-8p"]

# The export loop iterates over this name. It is an alias of the list above
# (same object), not a copy.
index_list_names = index_kurento_names



In [7]:
# Export CPU and memory usage per node role for a fixed time window.
#
# For every (index name, node role) pair the cell scrolls through the matching
# documents of the "metric*" indices, keeps the CPU / memory percentages found
# under "system", averages samples sharing a timestamp and writes one CSV file
# into `output_dir`.
node_types = ["browseremulator", "masternode", "medianode"]
output_dir = "dfs_final"

# Time window to export. Parsed once, since it is the same for every query.
# Named window_start / window_end so the builtins min() and max() are not
# shadowed for the rest of the kernel session.
date_format = "%Y-%m-%d %H:%M:%S.%f%z"
window_start = datetime.strptime("2024-05-29 12:11:32.986000+00:00", date_format)
window_end = datetime.strptime("2024-05-29 12:13:51.929000+00:00", date_format)


def _now():
    """Return the current local time as an ISO-8601 string for log lines."""
    return pd.Timestamp.now().isoformat()


def _extract_metrics(source):
    """Reduce one document's ``_source`` to the fields that get exported.

    Args:
        source: the ``_source`` mapping of a search hit; must contain
            "@timestamp".

    Returns:
        A dict with "@timestamp" plus "cpu" (system.cpu.total.norm.pct) and/or
        "memory" (system.memory.used.pct), or ``None`` when the document
        carries neither metric.

    Raises:
        KeyError: if "@timestamp" is missing, or if a present "cpu"/"memory"
            entry lacks the expected nested fields (left strict on purpose so
            schema changes are noticed).
    """
    row = {"@timestamp": source["@timestamp"]}
    if "system" in source:
        system = source["system"]
        if "cpu" in system:
            row["cpu"] = system["cpu"]["total"]["norm"]["pct"]
        if "memory" in system:
            row["memory"] = system["memory"]["used"]["pct"]
    # Decide on what was actually extracted. Testing the raw document instead
    # would let a document with a top-level "cpu"/"memory" key but no "system"
    # section through as a row holding nothing but a timestamp.
    if "cpu" in row or "memory" in row:
        return row
    return None


def generate_data(hits):
    """Yield one exportable row per hit that carries a CPU or memory value.

    Only ``_source`` is used, which drops the Elasticsearch metadata
    (_id, _type, _index). A running counter of hits read is printed in place.

    Args:
        hits: iterable of search hits, each a mapping with a "_source" entry.
    """
    for count, hit in enumerate(hits, start=1):
        print("Data read: ", count, end="\r")
        row = _extract_metrics(hit["_source"])
        if row is not None:
            yield row


# to_csv() does not create missing directories.
os.makedirs(output_dir, exist_ok=True)

for index_name in index_list_names:
    for node_type in node_types:
        query = {
            "query": {
                "bool": {
                    "must": [
                        {"match": {"fields.node_role": node_type}},
                        {
                            "range": {
                                "@timestamp": {
                                    "lte": window_end.isoformat(),
                                    "gte": window_start.isoformat(),
                                }
                            }
                        },
                    ]
                }
            }
        }
        # NOTE(review): the search targets "metric*" and is not filtered by
        # index_name, so every entry of index_list_names exports the same
        # documents under a different file name. Confirm this is intended.
        rel = scan(
            client=es,
            query=query,
            scroll="8h",
            index="metric*",
            raise_on_error=True,
            preserve_order=False,
            clear_scroll=True,
            request_timeout=300,
        )

        # Timestamp taken at print time so the log reflects when work starts.
        print(f"{_now()} - Processing {index_name}-{node_type}")
        df = pd.DataFrame(generate_data(rel))
        if df.empty:
            print(f"{_now()} - No cpu/memory data for {index_name}-{node_type}")
            continue

        # Several documents can share a timestamp; collapse them to their mean.
        df = df.groupby("@timestamp", as_index=False).mean()
        df["@timestamp"] = pd.to_datetime(df["@timestamp"])
        csv_path = os.path.join(output_dir, f"{index_name}-{node_type}.csv")
        df.to_csv(csv_path, index=False)
        print(f"{_now()} - Saved {index_name}-{node_type}.csv")

2024-05-29T17:05:51.223381 - Processing loadtest-webrtc-2024-kurento-8p-browseremulator
2024-05-29T17:05:55.792793 - Saved loadtest-webrtc-2024-kurento-8p-browseremulator.csv
2024-05-29T17:05:55.792793 - Processing loadtest-webrtc-2024-kurento-8p-masternode
2024-05-29T17:05:56.156112 - Saved loadtest-webrtc-2024-kurento-8p-masternode.csv
2024-05-29T17:05:56.156112 - Processing loadtest-webrtc-2024-kurento-8p-medianode
2024-05-29T17:05:56.502339 - Saved loadtest-webrtc-2024-kurento-8p-medianode.csv
