In [1]:
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from dotenv import load_dotenv
import os
import functools

# Load variables from a .env file (if one is found) into the process environment,
# so that settings such as ELK_HOST can be read with os.getenv.
load_dotenv()

True

In [2]:
# Elasticsearch endpoint, taken from the environment (load_dotenv() has already
# merged any .env file into it).
ELK_HOST = os.getenv("ELK_HOST")
if not ELK_HOST:
    # Fail here with an actionable message instead of handing None/"" to the
    # client, which would only surface later as an obscure connection error.
    raise RuntimeError(
        "ELK_HOST is not set; define it in the environment or in a .env file"
    )

# Shared client used by every query in this notebook.
# NOTE(review): verify_certs=False disables TLS certificate validation and
# ssl_show_warn=False silences the warning about it. Acceptable only for a
# trusted/internal cluster - confirm before pointing this at anything else.
es = Elasticsearch(
    hosts=[ELK_HOST],
    timeout=300,
    max_retries=10,
    retry_on_timeout=True,
    verify_certs=False,
    ssl_show_warn=False,
)

In [3]:
# One row per load-test run: (run name, window start, window end).
# Start/end are ISO-8601 timestamps with an explicit UTC offset. Keeping the
# name and its window on the same row makes the positional pairing between
# `index_list_names` and `test_times` explicit.
_runs = (
    (
        "loadtest-webrtc-2024-kurento-2p",
        "2024-06-10T15:07:37.000000+02:00",
        "2024-06-10T15:14:54.000000+02:00",
    ),
    (
        "loadtest-webrtc-2024-kurento-8p",
        "2024-05-29T12:11:32.986000+00:00",
        "2024-05-29T12:13:51.929000+00:00",
    ),
    (
        "loadtest-webrtc-2024-kurento-3p-10s",
        "2024-06-03T11:49:59.000000+02:00",
        "2024-06-03T11:54:42.000000+02:00",
    ),
    (
        "loadtest-webrtc-2024-kurento-3p-40s",
        "2024-06-03T11:25:09.000000+02:00",
        "2024-06-03T11:30:17.000000+02:00",
    ),
    (
        "loadtest-webrtc-2024-pion-2p",
        "2024-06-07T10:54:17.000000+02:00",
        "2024-06-07T11:02:07.000000+02:00",
    ),
    (
        "loadtest-webrtc-2024-pion-8p",
        "2024-06-07T10:45:53.000000+02:00",
        "2024-06-07T10:50:40.000000+02:00",
    ),
    (
        "loadtest-webrtc-2024-pion-3p-10s",
        "2024-06-07T15:41:34.000000+02:00",
        "2024-06-07T15:50:02.000000+02:00",
    ),
    (
        "loadtest-webrtc-2024-pion-3p-40s",
        "2024-06-07T15:57:56.000000+02:00",
        "2024-06-07T16:04:04.000000+02:00",
    ),
)

# Run names, in table order.
index_list_names = [run_name for run_name, _, _ in _runs]

# Time window of each run, in the same order as `index_list_names`.
test_times = [{"min": start, "max": end} for _, start, end in _runs]

In [4]:
from datetime import datetime

# Values of `fields.node_role` to export; one CSV is written per (run, role).
node_types = ["browseremulator", "masternode", "medianode"]

# Format of the "min"/"max" strings in `test_times` (ISO-8601 with UTC offset).
_DATE_FORMAT = "%Y-%m-%dT%H:%M:%S.%f%z"
# Directory that receives the exported CSV files.
_OUTPUT_DIR = "dfs_final"


def _timestamp():
    """Return the current local time as an ISO-8601 string, for log lines."""
    return pd.Timestamp.now().isoformat()


def _build_query(node_type, start, end):
    """Build the search body for one node role within one time window.

    Args:
        node_type: value matched against the `fields.node_role` field.
        start: datetime, inclusive lower bound for `@timestamp`.
        end: datetime, inclusive upper bound for `@timestamp`.

    Returns:
        A dict usable as the `query` argument of `elasticsearch.helpers.scan`.
    """
    return {
        "query": {
            "bool": {
                "must": [
                    {"match": {"fields.node_role": node_type}},
                    {
                        "range": {
                            "@timestamp": {
                                "lte": end.isoformat(),
                                "gte": start.isoformat(),
                            }
                        }
                    },
                ]
            }
        }
    }


def _iter_system_metrics(hits):
    """Yield one flat record per hit that carries a system CPU or memory value.

    Only `_source` is read, which drops Elasticsearch metadata such as `_id`
    and `_index`.

    Args:
        hits: iterable of hit dicts, each with a `_source` mapping that
            contains `@timestamp`.

    Yields:
        Dicts with `@timestamp` plus `cpu` (system.cpu.total.norm.pct) and/or
        `memory` (system.memory.used.pct), whichever the hit provides.
    """
    for count, hit in enumerate(hits, start=1):
        # "\r" keeps the running counter on a single console line.
        print("Data read: ", count, end="\r")
        source = hit["_source"]
        record = {"@timestamp": source["@timestamp"]}
        if "system" in source:
            system = source["system"]
            if "cpu" in system:
                record["cpu"] = system["cpu"]["total"]["norm"]["pct"]
            if "memory" in system:
                record["memory"] = system["memory"]["used"]["pct"]
        # Guard on the extracted record rather than on the raw document: a hit
        # without system.cpu/system.memory must never produce a row that holds
        # nothing but a timestamp, even if it has unrelated top-level keys
        # named "cpu" or "memory".
        if "cpu" in record or "memory" in record:
            yield record


# The two lists are paired by position, so a length mismatch means a run would
# be exported with the wrong window (or not at all); stop before querying.
if len(index_list_names) != len(test_times):
    raise ValueError(
        f"index_list_names has {len(index_list_names)} entries "
        f"but test_times has {len(test_times)}"
    )

# to_csv does not create missing directories.
os.makedirs(_OUTPUT_DIR, exist_ok=True)

for index_name, window in zip(index_list_names, test_times):
    # Named start/end (not min/max) so the builtins are not shadowed for the
    # rest of the kernel session.
    window_start = datetime.strptime(window["min"], _DATE_FORMAT)
    window_end = datetime.strptime(window["max"], _DATE_FORMAT)

    for node_type in node_types:
        # Timestamp is taken at print time so the log shows when work started.
        print(f"{_timestamp()} - Processing {index_name}-{node_type}")

        hits = scan(
            client=es,
            query=_build_query(node_type, window_start, window_end),
            scroll="8h",
            index="metric*",
            raise_on_error=True,
            preserve_order=False,
            clear_scroll=True,
            request_timeout=300,
        )

        # Building the frame consumes the scroll; rows are streamed, not
        # collected into an intermediate list first.
        metrics = pd.DataFrame(_iter_system_metrics(hits))
        if metrics.empty:
            print(f"{_timestamp()} - No data for {index_name}-{node_type}, skipping")
            continue

        # CPU and memory readings may arrive as separate documents (and from
        # several hosts); collapse them to one averaged row per timestamp.
        metrics = metrics.groupby("@timestamp", as_index=False).mean()
        metrics["@timestamp"] = pd.to_datetime(metrics["@timestamp"])
        metrics.to_csv(f"{_OUTPUT_DIR}/{index_name}-{node_type}.csv", index=False)
        print(f"{_timestamp()} - Saved {index_name}-{node_type}.csv")

2024-06-11T11:55:40.835984 - Processing loadtest-webrtc-2024-kurento-2p-browseremulator
2024-06-11T11:56:02.237711 - Saved loadtest-webrtc-2024-kurento-2p-browseremulator.csv
2024-06-11T11:56:02.237711 - Processing loadtest-webrtc-2024-kurento-2p-masternode
2024-06-11T11:56:02.812376 - Saved loadtest-webrtc-2024-kurento-2p-masternode.csv
2024-06-11T11:56:02.812376 - Processing loadtest-webrtc-2024-kurento-2p-medianode
2024-06-11T11:56:04.378001 - Saved loadtest-webrtc-2024-kurento-2p-medianode.csv
2024-06-11T11:56:04.378001 - Processing loadtest-webrtc-2024-kurento-8p-browseremulator
2024-06-11T11:56:08.498902 - Saved loadtest-webrtc-2024-kurento-8p-browseremulator.csv
2024-06-11T11:56:08.498902 - Processing loadtest-webrtc-2024-kurento-8p-masternode
2024-06-11T11:56:08.835199 - Saved loadtest-webrtc-2024-kurento-8p-masternode.csv
2024-06-11T11:56:08.835199 - Processing loadtest-webrtc-2024-kurento-8p-medianode
2024-06-11T11:56:09.166222 - Saved loadtest-webrtc-2024-kurento-8p-medianod