In [11]:
%pip install -r requirements.txt
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from dotenv import load_dotenv
import os

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Address of the Elasticsearch cluster that every query in this notebook targets.
ELK_HOST = os.getenv("ELK_HOST")
if not ELK_HOST:
    # Fail here with an actionable message rather than deep inside the client
    # constructor, where a missing host surfaces as an opaque error.
    raise RuntimeError(
        "ELK_HOST is not set; define it in the environment or in a .env file"
    )

es = Elasticsearch(hosts=[ELK_HOST])


In [12]:
def _build_index_names(run_suffix=""):
    """Return the Elasticsearch index names of one load-test run.

    Names follow "loadtest-webrtc-preliminary-<topology>-<instance>-final<run_suffix>".
    All topologies are listed for the first instance type, then for the second,
    which is the order downstream cells rely on when pairing `index_list[i]`
    with `df_list[i]`.

    Args:
        run_suffix: Text appended after "final" ("" for the first run,
            "-2" / "-3" for the repeated runs).

    Returns:
        A list of 18 index names (2 instance types x 9 topologies).
    """
    instance_types = ("t3medium", "c5xlarge")
    # "<N>p" is the publisher count; "<N>p-<M>s" adds a subscriber count.
    topologies = ("2p", "3p", "5p", "6p", "7p", "8p", "10p", "3p-10s", "3p-40s")
    return [
        f"loadtest-webrtc-preliminary-{topology}-{instance_type}-final{run_suffix}"
        for instance_type in instance_types
        for topology in topologies
    ]


index_list_1 = _build_index_names()
index_list_2 = _build_index_names("-2")
index_list_3 = _build_index_names("-3")

# Every index of every run, in run order.
index_list = index_list_1 + index_list_2 + index_list_3

In [13]:
from minio import Minio

MINIO_BUCKET = "openvidu-loadtest-preliminary"


def _index_name_from_prefix(prefix):
    """Map the first path segment of an object key to its Elasticsearch index name.

    The segment is split on "_" and read as
    "<N>publishers[_<M>subscribers]_<instance type>[_<retry>]"; the subscribers
    part and the retry part are optional.

    Args:
        prefix: First "/"-separated segment of a bucket object key.

    Returns:
        "loadtest-webrtc-preliminary-<N>p[-<M>s]-<instance type>-final[-<retry>]".

    Raises:
        IndexError: If the segment has fewer "_"-separated parts than the
            layout above requires.
    """
    parts = prefix.split("_")
    publishers = parts[0].replace("publishers", "") + "p"
    if parts[1].endswith("subscribers"):
        subscribers = "-" + parts[1].replace("subscribers", "") + "s"
        remainder = parts[2:]
    else:
        subscribers = ""
        remainder = parts[1:]
    instance_type = remainder[0]
    retry = "-" + remainder[1] if len(remainder) > 1 else ""
    return f"loadtest-webrtc-preliminary-{publishers}{subscribers}-{instance_type}-final{retry}"


client = Minio(
    os.getenv("MINIO_HOST"),
    access_key=os.getenv("MINIO_ACCESS_KEY"),
    secret_key=os.getenv("MINIO_SECRET_KEY"),
)

if not client.bucket_exists(MINIO_BUCKET):
    raise Exception("Bucket does not exist")

# One record per stored object: the index it belongs to and the second path
# segment of its key.
objects_names = []
for obj in client.list_objects(MINIO_BUCKET, recursive=True):
    key_segments = obj.object_name.split("/")
    objects_names.append({
        "index": _index_name_from_prefix(key_segments[0]),
        "file": key_segments[1],
    })

df_minio = pd.DataFrame(objects_names)
# This cell runs before the one that creates "dfs", so make sure the output
# directory exists; otherwise a fresh Restart & Run All fails on the write below.
os.makedirs("dfs", exist_ok=True)
df_minio.to_csv("dfs/minio_items.csv", index=False)

In [14]:
def get_qoe_data_from_elastic(index):
    """Load every document of `index` that has a "vmaf" field into a DataFrame.

    Args:
        index: Name of the Elasticsearch index to scan.

    Returns:
        A DataFrame with one row per hit, built from each hit's "_source"
        payload only (Elasticsearch metadata such as _id and _index is
        dropped). The frame is empty when nothing matches.
    """
    vmaf_exists_query = {"query": {"exists": {"field": "vmaf"}}}
    # scan() pages through the whole result set using the scroll API.
    hits = scan(
        client=es,
        query=vmaf_exists_query,
        scroll='1m',
        index=index,
        raise_on_error=True,
        preserve_order=False,
        clear_scroll=True,
    )
    sources = [hit['_source'] for hit in hits]
    return pd.DataFrame(sources)

# One QoE DataFrame per index, in the same order as index_list.
df_list = [get_qoe_data_from_elastic(index_name) for index_name in index_list]

In [15]:
# Parse the "@timestamp" column of every non-empty frame into datetimes, in place.
for qoe_frame in df_list:
    if qoe_frame.empty:
        continue
    qoe_frame["@timestamp"] = pd.to_datetime(qoe_frame["@timestamp"])

In [16]:
# Write each non-empty QoE frame to dfs/<index name>.csv.
os.makedirs("dfs", exist_ok=True)
for position, qoe_frame in enumerate(df_list):
    if qoe_frame.empty:
        continue
    csv_path = f"dfs/{index_list[position]}.csv"
    qoe_frame.to_csv(csv_path, index=False)


In [19]:
node_types = ["browseremulator", "masternode", "medianode"]


def _extract_system_metrics(source):
    """Build one CSV row from a metrics document.

    Args:
        source: The "_source" payload of a hit from the "metric*" indices.

    Returns:
        A dict with "@timestamp" plus "cpu" (system.cpu.total.norm.pct) and/or
        "memory" (system.memory.used.pct), or None when the document carries
        neither value.
    """
    row = {"@timestamp": source["@timestamp"]}
    if "system" in source:
        system = source["system"]
        if "cpu" in system:
            row["cpu"] = system["cpu"]["total"]["norm"]["pct"]
        if "memory" in system:
            row["memory"] = system["memory"]["used"]["pct"]
    # Decide from what was actually extracted; a timestamp-only row is useless.
    return row if len(row) > 1 else None


# For every test run that produced QoE data, export the CPU / memory usage of
# each node role over the time window covered by that run.
for i, df_user in enumerate(df_list):
    if df_user.empty:
        continue
    # Deliberately not named min/max: rebinding those at notebook level would
    # shadow the builtins for every cell executed afterwards.
    window_start = df_user["@timestamp"].min()
    window_end = df_user["@timestamp"].max()
    for node_type in node_types:
        query = {
            "query": {
                "bool": {
                    "must": [
                        {
                            "match": {
                                "fields.node_role": node_type
                            }
                        },
                        {
                            "range": {
                                "@timestamp": {
                                    "lte": window_end.isoformat(),
                                    "gte": window_start.isoformat()
                                }
                            }
                        }
                    ]
                }
            }
        }
        hits = scan(client=es,
            query=query,
            scroll='1m',
            index="metric*",
            raise_on_error=True,
            preserve_order=False,
            clear_scroll=True)
        # Keep only '_source'; Elasticsearch metadata (_id, _index, ...) is not needed.
        metric_rows = []
        for hit in hits:
            row = _extract_system_metrics(hit['_source'])
            if row is not None:
                metric_rows.append(row)
        df_metrics = pd.DataFrame(metric_rows)
        if not df_metrics.empty:
            # Documents sharing a timestamp are averaged into a single sample.
            df_metrics = df_metrics.groupby("@timestamp", as_index=False).mean()
            df_metrics["@timestamp"] = pd.to_datetime(df_metrics["@timestamp"])
            df_metrics.to_csv(f"dfs/{index_list[i]}-{node_type}.csv", index=False)


In [18]:
import json


def _flatten_direction(base_row, direction_stats):
    """Return a new row combining `base_row` with one direction's stats.

    Args:
        base_row: Columns shared by every row of the document
            ("@timestamp", "user_id").
        direction_stats: The "inbound" or "outbound" section of "webrtc_stats".
            Its "audio" and "video" sub-dicts are flattened into "audio_<key>"
            and "video_<key>" columns; a missing sub-dict contributes nothing.

    Returns:
        A fresh dict, so inbound and outbound rows never share state.
    """
    row = dict(base_row)
    for media in ("audio", "video"):
        for key, value in direction_stats.get(media, {}).items():
            row[f"{media}_{key}"] = value
    return row


# For every test run that produced QoE data, export the raw WebRTC stats of
# that run, split by direction.
for i, df_user in enumerate(df_list):
    if df_user.empty:
        continue
    # Deliberately not named min/max: rebinding those at notebook level would
    # shadow the builtins for every cell executed afterwards.
    window_start = df_user["@timestamp"].min()
    window_end = df_user["@timestamp"].max()
    query = {
        "query": {
            "bool": {
                "must": [
                    {
                        "exists": {
                            "field": "webrtc_stats"
                        }
                    },
                    {
                        "range": {
                            "@timestamp": {
                                "lte": window_end.isoformat(),
                                "gte": window_start.isoformat()
                            }
                        }
                    }
                ]
            }
        }
    }
    hits = scan(client=es,
        query=query,
        scroll='1m',
        index=index_list[i],
        raise_on_error=True,
        preserve_order=False,
        clear_scroll=True)
    inbound_rows = []
    outbound_rows = []
    # Keep only '_source'; Elasticsearch metadata (_id, _index, ...) is not needed.
    for hit in hits:
        data = hit['_source']
        base_row = {
            "@timestamp": data["@timestamp"],
            # participant_id is a JSON string; the user id is the third
            # "_"-separated token of its "clientData" value.
            "user_id": json.loads(data["participant_id"])["clientData"].split("_")[2]
        }
        webrtc_stats = data["webrtc_stats"]
        # Each direction gets its own row so that a document carrying both does
        # not have its outbound values overwrite the inbound ones.
        if "inbound" in webrtc_stats:
            inbound_rows.append(_flatten_direction(base_row, webrtc_stats["inbound"]))
        if "outbound" in webrtc_stats:
            outbound_rows.append(_flatten_direction(base_row, webrtc_stats["outbound"]))

    # Inbound rows go to the "-inbound" file and outbound rows to the
    # "-outbound" file (the two names used to be swapped).
    df_inbound = pd.DataFrame(inbound_rows)
    if not df_inbound.empty:
        df_inbound["@timestamp"] = pd.to_datetime(df_inbound["@timestamp"])
        df_inbound.to_csv(f"dfs/{index_list[i]}-webrtc-stats-inbound.csv", index=False)

    df_outbound = pd.DataFrame(outbound_rows)
    if not df_outbound.empty:
        df_outbound["@timestamp"] = pd.to_datetime(df_outbound["@timestamp"])
        df_outbound.to_csv(f"dfs/{index_list[i]}-webrtc-stats-outbound.csv", index=False)
