In [None]:
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from dotenv import load_dotenv
import os
import functools

# Load variables from a .env file (if python-dotenv finds one) into the process
# environment, so that settings such as ELK_HOST can be read with os.getenv.
load_dotenv()

In [None]:
# Elasticsearch endpoint, taken from the environment rather than hard-coded.
ELK_HOST = os.getenv("ELK_HOST")
if not ELK_HOST:
    # os.getenv returns None when the variable is missing; passing that on to
    # the client only produces an opaque error, so stop here with a clear one.
    raise RuntimeError(
        "ELK_HOST is not set; define it in the environment or in the .env file."
    )

# Shared client used by every query in this notebook. The long timeout and the
# retry settings are presumably there to survive slow scroll exports.
# NOTE(review): verify_certs=False turns off TLS certificate validation and
# ssl_show_warn=False hides the corresponding warning. Acceptable only for a
# trusted cluster with a self-signed certificate - confirm before reuse.
es = Elasticsearch(
    hosts=[ELK_HOST],
    timeout=300,
    max_retries=10,
    retry_on_timeout=True,
    verify_certs=False,
    ssl_show_warn=False,
)

In [None]:
# Every name has the form "loadtest-webrtc-2024-<media server>-<scenario>".
# The list is ordered media server by media server, with the four scenarios in
# the same order inside each one.
index_list_names = [
    f"loadtest-webrtc-2024-{media_server}-{scenario}"
    for media_server in ("kurento", "pion", "mediasoup")
    for scenario in ("2p", "8p", "3p-10s", "3p-40s")
]

# (start, end) timestamps of each test run, in the same order as
# index_list_names: entry k is the time window for index_list_names[k].
_test_windows = (
    ("2024-06-18T14:57:24.000000+02:00", "2024-06-18T15:06:04.000000+02:00"),  # kurento-2p
    ("2024-06-18T15:08:46.000000+02:00", "2024-06-18T15:12:31.000000+02:00"),  # kurento-8p
    ("2024-06-18T14:31:51.000000+02:00", "2024-06-18T14:37:51.000000+02:00"),  # kurento-3p-10s
    ("2024-06-18T14:45:59.000000+02:00", "2024-06-18T14:51:57.000000+02:00"),  # kurento-3p-40s
    ("2024-06-13T15:53:20.000000+02:00", "2024-06-13T16:05:33.000000+02:00"),  # pion-2p
    ("2024-08-16T12:13:36.000000+02:00", "2024-08-16T12:19:54.000000+02:00"),  # pion-8p
    ("2024-06-13T15:41:35.000000+02:00", "2024-06-13T15:49:48.000000+02:00"),  # pion-3p-10s
    ("2024-06-13T14:20:27.000000+02:00", "2024-06-13T14:28:30.000000+02:00"),  # pion-3p-40s
    ("2024-08-16T10:14:20.000000+02:00", "2024-08-16T10:32:14.000000+02:00"),  # mediasoup-2p
    ("2024-08-14T10:43:54.000000+02:00", "2024-08-14T10:47:44.000000+02:00"),  # mediasoup-8p
    ("2024-08-16T10:47:57.000000+02:00", "2024-08-16T11:01:41.000000+02:00"),  # mediasoup-3p-10s
    ("2024-08-16T11:09:37.000000+02:00", "2024-08-16T11:19:17.000000+02:00"),  # mediasoup-3p-40s
)

# Same structure as before: one {"min": ..., "max": ...} dict per test run.
test_times = [{"min": start, "max": end} for start, end in _test_windows]

In [None]:
from datetime import datetime

node_types = ["browseremulator", "masternode", "medianode"]

# Format of the "min"/"max" strings stored in test_times.
DATE_FORMAT = "%Y-%m-%dT%H:%M:%S.%f%z"
# Directory that receives one CSV per (index name, node type) pair.
OUTPUT_DIR = "dfs_final"


def _build_query(node_type, window_start, window_end):
    """Return the search body for one node role inside one time window.

    Args:
        node_type: Value matched against the ``fields.node_role`` field.
        window_start: Timezone-aware datetime, inclusive lower bound on ``@timestamp``.
        window_end: Timezone-aware datetime, inclusive upper bound on ``@timestamp``.

    Returns:
        A dict usable as the ``query`` argument of ``elasticsearch.helpers.scan``.
    """
    return {
        "query": {
            "bool": {
                "must": [
                    {"match": {"fields.node_role": node_type}},
                    {
                        "range": {
                            "@timestamp": {
                                "lte": window_end.isoformat(),
                                "gte": window_start.isoformat(),
                            }
                        }
                    },
                ]
            }
        }
    }


def _iter_system_metrics(hits):
    """Yield one flat record per hit that carries CPU and/or memory usage.

    Only ``_source`` is read, which drops the Elasticsearch metadata
    (``_id``, ``_type``, ``_index``). Hits without ``system.cpu`` and without
    ``system.memory`` are skipped entirely.

    Args:
        hits: Iterable of raw hits, each a dict with a ``_source`` dict that
            contains at least ``@timestamp``.

    Yields:
        Dicts with ``@timestamp`` plus ``cpu`` (``system.cpu.total.norm.pct``)
        and/or ``memory`` (``system.memory.used.pct``).
    """
    for count, hit in enumerate(hits, start=1):
        # Carriage return keeps the progress counter on a single output line.
        print("Data read: ", count, end="\r")
        source = hit["_source"]
        record = {"@timestamp": source["@timestamp"]}
        system = source["system"] if "system" in source else {}
        if "cpu" in system:
            record["cpu"] = system["cpu"]["total"]["norm"]["pct"]
        if "memory" in system:
            record["memory"] = system["memory"]["used"]["pct"]
        # Decide on what was actually extracted. The previous check looked at
        # a variable that was only sometimes rebound to the "system" sub-dict,
        # so a document with a top-level "cpu"/"memory" key but no "system"
        # section produced a timestamp-only row.
        if len(record) > 1:
            yield record


# The two lists are paired by position, so a length mismatch would associate
# test runs with the wrong time window (or fail halfway through the export).
if len(index_list_names) != len(test_times):
    raise ValueError(
        f"index_list_names has {len(index_list_names)} entries but "
        f"test_times has {len(test_times)}; they must match one-to-one."
    )

# Create the output directory up front so that a long scroll is not wasted on
# a failing to_csv call at the very end.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for index_name, window in zip(index_list_names, test_times):
    # Named window_start/window_end instead of min/max so the builtins stay
    # usable for the rest of the kernel session.
    window_start = datetime.strptime(window["min"], DATE_FORMAT)
    window_end = datetime.strptime(window["max"], DATE_FORMAT)

    for node_type in node_types:
        # index_name only labels the output file; the data itself is selected
        # from the "metric*" indices by node role and time window.
        query = _build_query(node_type, window_start, window_end)
        rel = scan(
            client=es,
            query=query,
            scroll="8h",
            index="metric*",
            raise_on_error=True,
            preserve_order=False,
            clear_scroll=True,
            request_timeout=300,
        )

        # Timestamp is taken at print time so each log line is current.
        print(f"{pd.Timestamp.now().isoformat()} - Processing {index_name}-{node_type}")

        # scan() is lazy: the hits are fetched while the DataFrame is built.
        df = pd.DataFrame(_iter_system_metrics(rel))
        if df.empty:
            print(f"{pd.Timestamp.now().isoformat()} - No data for {index_name}-{node_type}")
            continue

        # Collapse documents that share a timestamp into a single averaged row.
        df = df.groupby("@timestamp", as_index=False).mean()
        df["@timestamp"] = pd.to_datetime(df["@timestamp"])
        df.to_csv(
            os.path.join(OUTPUT_DIR, f"{index_name}-{node_type}.csv"), index=False
        )
        print(f"{pd.Timestamp.now().isoformat()} - Saved {index_name}-{node_type}.csv")