In [None]:
# Install the packages listed in requirements.txt. %pip (rather than !pip)
# targets the environment of the running kernel.
%pip install -r requirements.txt

In [None]:
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment so that the
# os.getenv(...) lookups later in the notebook can read them.
# NOTE(review): relies on python-dotenv's default .env discovery - confirm the
# file location if the notebook is started from another directory.
load_dotenv()



In [None]:
# Elasticsearch endpoint, taken from the environment.
ELK_HOST = os.getenv("ELK_HOST")
if not ELK_HOST:
    # Fail here with an explicit message instead of deep inside the client
    # constructor, which would otherwise receive hosts=[None].
    raise RuntimeError(
        "ELK_HOST is not set; define it in the environment or in the .env file"
    )

# Shared client used by every query in this notebook. Long timeout and retries
# because the scans below can run for a long time.
# NOTE(review): verify_certs=False disables TLS certificate validation -
# confirm this is acceptable for the target cluster.
es = Elasticsearch(
    hosts=[ELK_HOST],
    timeout=300,
    max_retries=10,
    retry_on_timeout=True,
    verify_certs=False,
)

In [None]:
# Each run is described once as (Elasticsearch index, output name) so the two
# lists derived below always stay aligned position by position.
_LIVEKIT_RUNS = [
    # 2p retry
    (
        "loadtest-webrtc-stats-1705318060009",
        "loadtest-webrtc-final-livekit-2p-t3medium-retry",
    ),
    # 8p retry
    (
        "loadtest-webrtc-stats-1704977399068",
        "loadtest-webrtc-final-livekit-8p-t3medium-retry",
    ),
    # 3p 10 s retry
    (
        "loadtest-webrtc-stats-1704970758533",
        "loadtest-webrtc-final-livekit-3p-10s-t3medium-retry",
    ),
    # 3p 40 s retry
    (
        "loadtest-webrtc-stats-1704987214896",
        "loadtest-webrtc-final-livekit-3p-40s-t3medium-retry",
    ),
]

# Elasticsearch indices to query, and the names used for the generated files.
index_lk = [es_index for es_index, _ in _LIVEKIT_RUNS]
index_lk_names = [run_name for _, run_name in _LIVEKIT_RUNS]

# Aliases used by the rest of the notebook.
index_list = index_lk
index_list_names = index_lk_names

In [None]:
# Per-run time window, keyed by run name (orient="index"). The "from" and "to"
# columns are parsed as ISO 8601 strings and normalised to UTC.
start_end_times = pd.read_json(
    "dfs_final/start-end-times.json", orient="index"
).assign(
    **{
        "from": lambda frame: pd.to_datetime(
            frame["from"], format="ISO8601", utc=True
        ),
        "to": lambda frame: pd.to_datetime(
            frame["to"], format="ISO8601", utc=True
        ),
    }
)

def get_max_time(df_index, index):
    """Return the later of two instants for one run.

    Args:
        df_index: DataFrame with an ``@timestamp`` column.
        index: row label of the run in ``start_end_times``.

    Returns:
        The maximum of the newest ``@timestamp`` in ``df_index`` and the
        ``to`` value stored for ``index`` in ``start_end_times``.
    """
    newest_sample = df_index["@timestamp"].max()
    window_end = start_end_times.loc[index, "to"]
    return pd.Series([newest_sample, window_end]).max()


def get_min_time(df_index, index):
    """Return the earlier of two instants for one run.

    Args:
        df_index: DataFrame with an ``@timestamp`` column.
        index: row label of the run in ``start_end_times``.

    Returns:
        The minimum of the oldest ``@timestamp`` in ``df_index`` and the
        ``from`` value stored for ``index`` in ``start_end_times``.
    """
    oldest_sample = df_index["@timestamp"].min()
    window_start = start_end_times.loc[index, "from"]
    return pd.Series([oldest_sample, window_start]).min()

In [None]:
def generate_qoe_data_from_elastic(index):
    """Lazily yield the ``_source`` of every document in ``index`` that has a ``vmaf`` field.

    Args:
        index: name of the Elasticsearch index to scan.

    Yields:
        The ``_source`` dict of each matching hit, in whatever order the scan
        returns them (``preserve_order=False``).
    """
    vmaf_docs = {"query": {"exists": {"field": "vmaf"}}}
    # scan() pages through the complete result set using the scroll API.
    hits = scan(
        index=index,
        client=es,
        query=vmaf_docs,
        scroll="1m",
        clear_scroll=True,
        preserve_order=False,
        raise_on_error=True,
    )
    # Keep only the payload; this drops Elasticsearch metadata such as
    # _id, _type and _index.
    yield from (hit["_source"] for hit in hits)


# One lazy hit generator per index; each is drained into its own DataFrame, so
# df_list[k] holds the QoE documents of index_list[k].
df_generators = map(generate_qoe_data_from_elastic, index_list)

df_list = [pd.DataFrame(qoe_hits) for qoe_hits in df_generators]

In [None]:
# Convert the "@timestamp" column of every non-empty QoE frame to datetimes.
# Empty frames have no such column and are left untouched.
for df_user in df_list:
    if df_user.empty:
        continue
    df_user["@timestamp"] = pd.to_datetime(df_user["@timestamp"])

In [None]:
def generate_user_data_from_elastic(index):
    """Lazily yield one record per document in ``index`` that has a ``new_participant_id`` field.

    Args:
        index: name of the Elasticsearch index to scan.

    Yields:
        dict with keys ``@timestamp``, ``participant`` (from
        ``new_participant_id``) and ``session`` (from
        ``new_participant_session``).
    """
    join_docs = {"query": {"exists": {"field": "new_participant_id"}}}
    # scan() pages through the complete result set using the scroll API.
    hits = scan(
        index=index,
        client=es,
        query=join_docs,
        scroll="1m",
        clear_scroll=True,
        preserve_order=False,
        raise_on_error=True,
    )
    # Only the payload matters; Elasticsearch metadata such as _id, _type and
    # _index is discarded.
    for source in (hit["_source"] for hit in hits):
        yield {
            "@timestamp": source["@timestamp"],
            "participant": source["new_participant_id"],
            "session": source["new_participant_session"],
        }


# For every run: fetch the participant-join events, save them to CSV, and tag
# each QoE sample with the number of join events seen up to (and including)
# that sample's timestamp.
for i, index in enumerate(index_list):
    df_generators = generate_user_data_from_elastic(index)

    df_users = pd.DataFrame(df_generators)
    if not df_users.empty:
        # An index without join events yields a frame with no columns at all,
        # so the timestamp handling and the CSV export only apply when there
        # is data (same convention as the other export cells).
        df_users["@timestamp"] = pd.to_datetime(df_users["@timestamp"])
        df_users = df_users.sort_values(by="@timestamp")
        df_users.to_csv(f"dfs_final/{index_list_names[i]}-user-join.csv", index=False)

    df = df_list[i]
    if df.empty:
        # No QoE documents for this index: nothing to annotate.
        continue

    df = df.sort_values(by="@timestamp")
    if df_users.empty:
        df["user_count"] = 0
    else:
        # df_users is sorted by time, so the insertion point to the right of
        # each QoE timestamp equals the number of join events <= that
        # timestamp. This replaces a full filter of df_users per QoE row.
        df["user_count"] = df_users["@timestamp"].searchsorted(
            df["@timestamp"], side="right"
        )
    df_list[i] = df

In [None]:
# Write every non-empty QoE frame to dfs_final/<run name>.csv, creating the
# output directory first if it is missing.
os.makedirs("dfs_final", exist_ok=True)
for i, df in enumerate(df_list):
    if df.empty:
        continue
    csv_path = f"dfs_final/{index_list_names[i]}.csv"
    df.to_csv(csv_path, index=False)

In [None]:
# Warning: this process can be long (hours).
#
# For each run that has QoE data, read the CPU and memory samples of every node
# role from the "metric*" indices, limited to the time range given by
# get_min_time/get_max_time for that run, and save one CSV per (run, node role).
node_types = ["browseremulator", "masternode", "medianode"]


def _iter_system_metrics(hits):
    """Yield one flat row per hit that carries a CPU and/or memory reading.

    Args:
        hits: iterable of Elasticsearch hits (dicts with a ``_source`` key).

    Yields:
        dict with ``@timestamp`` plus ``cpu`` (``system.cpu.total.norm.pct``)
        and/or ``memory`` (``system.memory.used.pct``). Hits that provide
        neither value are skipped.
    """
    for count, hit in enumerate(hits, start=1):
        # Single-line progress counter, overwritten in place.
        print("Data read: ", count, end="\r")
        source = hit["_source"]
        row = {"@timestamp": source["@timestamp"]}
        if "system" in source:
            system = source["system"]
            if "cpu" in system:
                row["cpu"] = system["cpu"]["total"]["norm"]["pct"]
            if "memory" in system:
                row["memory"] = system["memory"]["used"]["pct"]
        # Decide on what was actually extracted. Testing the source document
        # instead could let a hit without "system" through as a row that
        # contains nothing but a timestamp.
        if "cpu" in row or "memory" in row:
            yield row


for i, df_user in enumerate(df_list):
    if df_user.empty:
        continue
    index_name = index_list_names[i]
    # Named min_time/max_time so the min()/max() builtins are not shadowed for
    # the rest of the kernel session.
    min_time = get_min_time(df_user, index_name)
    max_time = get_max_time(df_user, index_name)
    for node_type in node_types:
        query = {
            "query": {
                "bool": {
                    "must": [
                        {"match": {"fields.node_role": node_type}},
                        {
                            "range": {
                                "@timestamp": {
                                    "lte": max_time.isoformat(),
                                    "gte": min_time.isoformat(),
                                }
                            }
                        },
                    ]
                }
            }
        }
        rel = scan(
            client=es,
            query=query,
            scroll="8h",
            index="metric*",
            raise_on_error=True,
            preserve_order=False,
            clear_scroll=True,
            request_timeout=300,
        )

        # Take the timestamp right before logging so it is never stale.
        current_time = pd.Timestamp.now().isoformat()
        print(f"{current_time} - Processing {index_name}-{node_type}")

        # We need only '_source', which has all the fields required.
        # This eliminates the Elasticsearch metadata like _id, _type, _index.
        df = pd.DataFrame(_iter_system_metrics(rel))
        if df.empty:
            continue

        # Several documents can share a timestamp; average them into one row.
        df = df.groupby("@timestamp", as_index=False).mean()
        df["@timestamp"] = pd.to_datetime(df["@timestamp"])
        df.to_csv(f"dfs_final/{index_name}-{node_type}.csv", index=False)
        current_time = pd.Timestamp.now().isoformat()
        print(f"{current_time} - Saved {index_name}-{node_type}.csv")

In [None]:
import json

def _flatten_webrtc_direction(source, direction):
    """Build a flat row for one direction of a webrtc_stats document.

    Args:
        source: ``_source`` dict of a hit; must contain ``webrtc_stats``.
        direction: ``"inbound"`` or ``"outbound"``.

    Returns:
        A new dict with ``@timestamp``, ``user_id`` (from ``participant_id``),
        ``session_id`` and one ``audio_<key>`` / ``video_<key>`` entry per
        value found under ``webrtc_stats[direction]``; ``None`` when the
        document has no data for ``direction``.
    """
    webrtc_stats = source["webrtc_stats"]
    if direction not in webrtc_stats:
        return None
    row = {
        "@timestamp": source["@timestamp"],
        "user_id": source["participant_id"],
        "session_id": source["session_id"],
    }
    for media in ("audio", "video"):
        if media in webrtc_stats[direction]:
            for key, value in webrtc_stats[direction][media].items():
                row[f"{media}_{key}"] = value
    return row


# For each run with QoE data, export the inbound and outbound WebRTC stats of
# its index (limited to the run's time range) to two separate CSV files.
for i, df_user in enumerate(df_list):
    index_name = index_list_names[i]
    current_time = pd.Timestamp.now().isoformat()
    print(f"{current_time} - Processing {index_name}")
    if df_user.empty:
        continue

    # Named min_time/max_time so the min()/max() builtins are not shadowed for
    # the rest of the kernel session.
    min_time = get_min_time(df_user, index_name)
    max_time = get_max_time(df_user, index_name)
    query = {
        "query": {
            "bool": {
                "must": [
                    {"exists": {"field": "webrtc_stats"}},
                    {
                        "range": {
                            "@timestamp": {
                                "lte": max_time.isoformat(),
                                "gte": min_time.isoformat(),
                            }
                        }
                    },
                ]
            }
        }
    }
    rel = scan(
        client=es,
        query=query,
        scroll="1m",
        index=index_list[i],
        raise_on_error=True,
        preserve_order=False,
        clear_scroll=True,
    )

    temp_inbound = []
    temp_outbound = []
    # We need only '_source', which has all the fields required.
    # This eliminates the Elasticsearch metadata like _id, _type, _index.
    # Hits are consumed as they arrive instead of being buffered in a list.
    for hit in rel:
        data = hit["_source"]
        # Each direction gets its own row dict. Sharing a single dict between
        # both lists would make a document that has inbound and outbound stats
        # appear in both CSVs with the merged, partly overwritten
        # audio_*/video_* values.
        inbound_row = _flatten_webrtc_direction(data, "inbound")
        if inbound_row is not None:
            temp_inbound.append(inbound_row)
        outbound_row = _flatten_webrtc_direction(data, "outbound")
        if outbound_row is not None:
            temp_outbound.append(outbound_row)

    df_inbound = pd.DataFrame(temp_inbound)
    if not df_inbound.empty:
        df_inbound["@timestamp"] = pd.to_datetime(df_inbound["@timestamp"])
        df_inbound.to_csv(
            f"dfs_final/{index_name}-webrtc-stats-inbound.csv",
            index=False,
        )

    df_outbound = pd.DataFrame(temp_outbound)
    if not df_outbound.empty:
        df_outbound["@timestamp"] = pd.to_datetime(df_outbound["@timestamp"])
        df_outbound.to_csv(
            f"dfs_final/{index_name}-webrtc-stats-outbound.csv",
            index=False,
        )

In [None]:
from minio import Minio

# MinIO connection; host and credentials come from the environment.
client = Minio(
    os.getenv("MINIO_HOST"),
    access_key=os.getenv("MINIO_ACCESS_KEY"),
    secret_key=os.getenv("MINIO_SECRET_KEY"),
)
prefix = "openvidu-loadtest-final-"

# One record per stored object. The records are gathered in a plain list and
# turned into a DataFrame once at the end; growing the frame one row at a time
# with .loc[len(df)] gets slower with every row added.
object_records = []
for index in index_list_names:
    # Derive the bucket name from the run name, e.g.
    # "loadtest-webrtc-final-livekit-3p-10s-t3medium-retry"
    #   -> "openvidu-loadtest-final-livekit-3p-10s-t3medium-retry".
    index_split = index.split("-")
    bucket_name = prefix
    bucket_name += "livekit-"

    bucket_name += index_split[4] + "-"
    # Field 5 is either a duration such as "10s" or already the instance type.
    if "s" in index_split[5]:
        bucket_name += index_split[5] + "-t3medium"
    else:
        bucket_name += "t3medium"
    bucket_name += "-retry"

    found = client.bucket_exists(bucket_name)
    if not found:
        raise Exception("Bucket does not exist")

    objects = client.list_objects(bucket_name, recursive=False)
    for obj in objects:
        name = obj.object_name
        split = name.split("_")
        is_error = "error" in name
        # Names containing "error" carry two extra underscore-separated fields
        # before the session, so the interesting fields are shifted by two.
        offset = 2 if is_error else 0
        object_records.append(
            {
                "index": index,
                "session": split[1 + offset],
                "userFrom": split[2 + offset],
                "userTo": split[3 + offset].split(".")[0],
                "error": is_error,
            }
        )

# Explicit columns keep the CSV header stable even when no object was found.
objects_names = pd.DataFrame(
    object_records, columns=["index", "session", "userFrom", "userTo", "error"]
)
objects_names.to_csv("dfs_final/minio_items_lk.csv", index=False)