In [1]:
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Address of the Elasticsearch cluster, taken from the ELK_HOST environment variable.
ELK_HOST = os.getenv("ELK_HOST")
if ELK_HOST is None:
    # Fail fast with an actionable message instead of passing `None` to the
    # client, which would only surface later as an unrelated-looking error.
    raise RuntimeError(
        "ELK_HOST is not set; define it in the environment or in a .env file"
    )

# Shared Elasticsearch client used by every query in this notebook.
es = Elasticsearch(hosts=[ELK_HOST])


In [2]:
# Names of the Elasticsearch indices to export. Every name follows the pattern
# "loadtest-webrtc-preliminary-mediasoup-<scenario>-<instance>-final"; the list
# is ordered scenario by scenario, with both instance tags for each scenario.
_scenario_tags = ["2p", "5p", "6p", "7p", "8p", "10p", "3p-10s", "3p-40s"]
_instance_tags = ["t3medium", "c5xlarge"]

index_list = [
    f"loadtest-webrtc-preliminary-mediasoup-{scenario}-{instance}-final"
    for scenario in _scenario_tags
    for instance in _instance_tags
]


def get_qoe_data_from_elastic(index):
    """Download every document of ``index`` that has a ``vmaf`` field.

    Args:
        index: Name of the Elasticsearch index to read from.

    Returns:
        A ``pandas.DataFrame`` with one row per matching document, built from
        each hit's ``_source`` only (Elasticsearch metadata such as ``_id`` and
        ``_index`` is dropped). The frame is empty when nothing matches.
    """
    # Only documents that actually carry a "vmaf" field are of interest.
    vmaf_exists_query = {"query": {"exists": {"field": "vmaf"}}}

    # `scan` pages through the complete result set via the scroll API.
    hits = scan(
        client=es,
        query=vmaf_exists_query,
        scroll='1m',
        index=index,
        raise_on_error=True,
        preserve_order=False,
        clear_scroll=True,
    )

    # Keep just the document bodies and turn them into a table.
    sources = [hit['_source'] for hit in hits]
    return pd.DataFrame(sources)


In [3]:
# Download the QoE documents of every index: one DataFrame per entry of
# index_list, in the same order.
df_list = [get_qoe_data_from_elastic(index) for index in index_list]
for df_user in df_list:
    # An index without matching documents yields a DataFrame with no columns
    # at all; skip it so the column lookup below does not raise KeyError.
    if "@timestamp" not in df_user.columns:
        continue
    # Parse the timestamp strings into datetimes so they can be compared.
    df_user["@timestamp"] = pd.to_datetime(df_user["@timestamp"])


In [4]:
# Write each non-empty QoE DataFrame to dfs/<index name>.csv.
os.makedirs("dfs", exist_ok=True)
for index_name, df in zip(index_list, df_list):
    if df.empty:
        # Nothing was downloaded for this index, so there is nothing to write.
        continue
    df.to_csv(f"dfs/{index_name}.csv", index=False)


In [5]:
# Values of "fields.node_role" for which system metrics are exported.
node_types = ["browseremulator", "masternode", "medianode"]


def _extract_system_metrics(source):
    """Reduce one metric document to its timestamp and CPU / memory usage.

    Args:
        source: The ``_source`` dict of a hit from the ``metric*`` indices.

    Returns:
        A dict holding ``"@timestamp"`` plus ``"cpu"`` and/or ``"memory"``
        when the document's ``"system"`` section provides them, or ``None``
        when it provides neither (so no timestamp-only rows are produced).
    """
    record = {"@timestamp": source["@timestamp"]}
    system = source.get("system", {})
    if "cpu" in system:
        record["cpu"] = system['cpu']['total']['norm']['pct']
    if "memory" in system:
        record["memory"] = system['memory']['used']['pct']
    # Keep the document only if a metric was actually extracted.
    if "cpu" in record or "memory" in record:
        return record
    return None


# Make sure the output directory exists before any CSV is written.
os.makedirs("dfs", exist_ok=True)

for index_name, df_user in zip(index_list, df_list):
    # No QoE data means no time window to query metrics for (and an empty
    # frame has no "@timestamp" column to read).
    if df_user.empty:
        continue

    # Time window covered by this load test. Deliberately not named
    # `min` / `max`, which would shadow the builtins for the rest of the
    # kernel session.
    window_start = df_user["@timestamp"].min()
    window_end = df_user["@timestamp"].max()

    for node_type in node_types:
        # Metric documents of this node role inside the test's time window.
        query = {
            "query": {
                "bool": {
                    "must": [
                        {
                            "match": {
                                "fields.node_role": node_type
                            }
                        },
                        {
                            "range": {
                                "@timestamp": {
                                    "lte": window_end.isoformat(),
                                    "gte": window_start.isoformat()
                                }
                            }
                        }
                    ]
                }
            }
        }
        hits = scan(client=es,
                    query=query,
                    scroll='1m',
                    index="metric*",
                    raise_on_error=True,
                    preserve_order=False,
                    clear_scroll=True)

        # Only the document body ('_source') is needed; drop hits that carry
        # neither a CPU nor a memory reading.
        records = []
        for hit in hits:
            record = _extract_system_metrics(hit['_source'])
            if record is not None:
                records.append(record)
        if not records:
            continue

        # Average all readings that share a timestamp, then parse the
        # timestamps and store one CSV per (index, node role).
        node_df = pd.DataFrame(records).groupby("@timestamp", as_index=False).mean()
        node_df["@timestamp"] = pd.to_datetime(node_df["@timestamp"])
        node_df.to_csv(f"dfs/{index_name}-{node_type}.csv", index=False)
