In [None]:
import os

# Directory that holds the files of the experiment run inspected by this notebook
data_path = "../data/minimized_run_5/1732116960_yolo11n_160/"

folder_path = data_path

# Size of every directory entry in MiB, keyed by entry name
file_sizes = {
    entry: os.path.getsize(os.path.join(folder_path, entry)) / (1024 * 1024)
    for entry in os.listdir(folder_path)
}

# Same mapping, re-ordered from the smallest entry to the largest
sorted_files_by_size = {
    name: file_sizes[name] for name in sorted(file_sizes, key=file_sizes.get)
}

# Last expression of the cell: display the sorted mapping
sorted_files_by_size

In [None]:
import pandas as pd
import os

folder_path = data_path

# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make worker_dataframes[0] refer to the same worker file on every run.
worker_files = sorted(
    file
    for file in os.listdir(folder_path)
    if file.startswith('worker') and file.endswith('.feather')
)

# One dataframe per worker feather file, in sorted file-name order
worker_dataframes = [pd.read_feather(os.path.join(folder_path, file)) for file in worker_files]

# List the column names of the first worker, one per line
for col in worker_dataframes[0].columns:
    print(col)

# kepler_process_package_joules_total -> Idle + dynamic + pid (int) + container_id (text or hex) + command (text)
# --> command: python3.8
# --> command: runc:[1:CHILD]
# kepler_process_cpu_instructions_total -> command, container_id, instance, pid
# kepler_container_package_joules_total -> container_id, container_name, container_namespace, instance, dynamic/idle, pod_name

# node_network_transmit_* -> device: lo/tunl0 or cali{hex} or enp88s0
# node_network_send_* -> device: lo/tunl0 or cali{hex} or enp88s0
# node_network_receive_* -> device: lo/tunl0 or cali{hex} or enp88s0

# instance:node_cpu_* -> instance
# instance:node_memory_* -> instance
# instance:node_network_* -> instance
# instance:* -> instance
# node_* -> instance
# node_cpu_seconds_total -> cpu, instance, mode

# Summary:
# Can use: kepler_container_* > container_namespace: workloadb
# Cannot use: kepler_process_* -> too many IDs and must guess the workload process (python3.8 is the best guess + subprocesses?)

# Idea: Compare global worker stats to container stats for predicting the application (or: worker stats vs container stats)

In [None]:
import json

def try_load_json(column_json: str):
    """
    Parse a column name as JSON, returning None when that is not possible.

    Args:
        column_json: Column label, expected to be a JSON-encoded string.

    Returns:
        The decoded JSON value, or None if the label is not valid JSON or is
        not a str/bytes/bytearray at all.
    """
    try:
        return json.loads(column_json)
    except (json.JSONDecodeError, TypeError):
        # JSONDecodeError: the text is not valid JSON (e.g. a plain column name).
        # TypeError: json.loads() only accepts str/bytes/bytearray, so labels such
        # as None or integers end up here instead of crashing the caller.
        return None

def rename_workers(column_json: str):
    """
    Collapse the numbered worker names into the generic name "worker".

    Every occurrence of "worker1" through "worker5" in the column string is
    replaced, one number after the other in ascending order, so that e.g.
    "instance":"worker1" becomes "instance":"worker".
    """
    generalized = column_json
    for worker_number in (1, 2, 3, 4, 5):
        generalized = generalized.replace(f"worker{worker_number}", "worker")
    return generalized

def filter_namespaces(column_json: str):
    """
    Decide whether a column is kept based on its "container_namespace" label.

    Filters out key-pairs such as: "container_namespace":"kube-system"

    Args:
        column_json: Column name, usually a JSON object of metric labels.

    Returns:
        True if the column should be kept, False if it should be dropped.
    """
    json_obj = try_load_json(column_json)
    # Accept all columns that are not a JSON object: names that cannot be parsed,
    # but also names that parse as a JSON scalar or list and so carry no labels
    if not isinstance(json_obj, dict):
        return True
    # Accept all columns that do not have a namespace
    if "container_namespace" not in json_obj:
        return True
    # If the column has a namespace, accept only the "workloadb" namespace
    # (all YOLO workload applications run under the "workloadb" namespace)
    return json_obj["container_namespace"] == "workloadb"

def filter_go_specific(column_json: str):
    """
    Decide whether a column is kept, dropping metrics whose name starts with "go_".

    Filters out columns related to the Go runtime, such as garbage collection
    (e.g., go_memstats_last_gc_time_seconds).

    Args:
        column_json: Column name, usually a JSON object of metric labels.

    Returns:
        True if the column should be kept, False if it should be dropped.
    """
    json_obj = try_load_json(column_json)
    # Accept all columns that are not a JSON object (unparsable or label-less names)
    if not isinstance(json_obj, dict):
        return True
    # Accept columns without a usable metric name instead of raising KeyError
    column_name = json_obj.get("__name__")
    if not isinstance(column_name, str):
        return True
    # Accept all columns, except go specific columns
    return not column_name.startswith("go_")

def filter_process_specific(column_json: str):
    """
    Decide whether a column is kept, dropping metrics named "kepler_process_*".

    These columns carry very specific process ids and container ids. The
    processes are also included in the kepler_container metrics.

    Args:
        column_json: Column name, usually a JSON object of metric labels.

    Returns:
        True if the column should be kept, False if it should be dropped.
    """
    json_obj = try_load_json(column_json)
    # Accept all columns that are not a JSON object (unparsable or label-less names)
    if not isinstance(json_obj, dict):
        return True
    # Accept columns without a usable metric name instead of raising KeyError
    column_name = json_obj.get("__name__")
    if not isinstance(column_name, str):
        return True
    # Accept all columns, except process specific columns
    return not column_name.startswith("kepler_process_")

def filter_durations(column_json: str):
    """
    Decide whether a column is kept, dropping metrics with "_duration_" in their name.

    Filters out columns like:
    - go_gc_duration_seconds_count
    - go_gc_duration_seconds_sum
    - node_scrape_collector_duration_seconds
    - scrape_duration_seconds
    - go_gc_duration_seconds

    Args:
        column_json: Column name, usually a JSON object of metric labels.

    Returns:
        True if the column should be kept, False if it should be dropped.
    """
    json_obj = try_load_json(column_json)
    # Accept all columns that are not a JSON object (unparsable or label-less names)
    if not isinstance(json_obj, dict):
        return True
    # Accept columns without a usable metric name instead of raising KeyError
    column_name = json_obj.get("__name__")
    if not isinstance(column_name, str):
        return True
    # Accept all columns, except those with "_duration_" in their metric name
    return "_duration_" not in column_name

def rename_container_columns(column_json: str):
    """
    Removes "container_id" and "pod_name" from the column JSON string.

    Example: {"__name__":"kepler_container_cpu_instructions_total","container_id":"e4f3637bcbcb4fa1db77b269c7a1eec025fce7bb982e38b4ba668804f371b90f","container_name":"yolo-consumer","container_namespace":"workloadb","instance":"worker","pod_name":"yolo-consumer-64478765c9-k5bvh"}

    Args:
        column_json: Column name, usually a JSON object of metric labels.

    Returns:
        The column name unchanged if it is not a JSON object; otherwise the
        object without the two keys, re-serialized with json.dumps() defaults
        (so the spacing of the returned string can differ from the input).
    """
    json_obj = try_load_json(column_json)
    # Do nothing if the column is not a JSON object: it either cannot be parsed
    # or is a JSON scalar/list without key-value pairs to remove
    if not isinstance(json_obj, dict):
        return column_json

    # Remove "container_id" and "pod_name" key-value pairs (no error if absent)
    for key in ("container_id", "pod_name"):
        json_obj.pop(key, None)

    # Return the JSON object as a string
    return json.dumps(json_obj)

"""
Convert monotonically increasing cols to rates (e.g., total joules to joules per unit of time)
"""
def get_counters(df):
    """Return the names of the columns of df whose values are monotonically increasing."""
    counter_columns = []
    for column in df.columns:
        if df[column].is_monotonic_increasing:
            counter_columns.append(column)
    return counter_columns

def convert_to_rates(df, counters):
    """
    Append a differenced "*_rate" column for every counter column.

    For each counter whose column name is a JSON object with a string
    "__name__", a new column is created that holds the difference between
    consecutive rows (the first row is NaN). Its name is the same JSON object
    with "_rate" appended to "__name__" and "_total_" collapsed to "_"
    (e.g. "..._joules_total" becomes "..._joules_rate").

    Counters whose name is not such a JSON object are skipped.

    Args:
        df: Dataframe that contains the counter columns.
        counters: Names of the columns of df to convert.

    Returns:
        A new dataframe with the columns of df followed by the rate columns.
        The original counter columns are not removed.
    """
    rate_columns = []
    for counter in counters:
        json_obj = try_load_json(counter)
        # Skip names that are not a JSON object with a metric name instead of
        # failing with KeyError/TypeError on the "__name__" lookup
        if not isinstance(json_obj, dict) or not isinstance(json_obj.get("__name__"), str):
            continue
        new_name = json_obj["__name__"] + "_rate"
        # Some counters end in "_total"; drop it so the name reads "<metric>_rate"
        new_name = new_name.replace("_total_", "_")
        json_obj["__name__"] = new_name
        rate_columns.append(df[counter].diff().rename(json.dumps(json_obj)))

    # Combine the existing dataframe with the new rate columns in a single concat
    return pd.concat([df] + rate_columns, axis=1)


In [None]:
# Build a new frame instead of assigning to worker_dataframes[0].columns, so the
# loaded dataframe keeps its original column names when this cell is (re-)run.
source_df = worker_dataframes[0]

# Renaming columns to more generalized format (e.g., "worker" instead of "worker1")
df = source_df.set_axis(
    [rename_container_columns(rename_workers(col)) for col in source_df.columns],
    axis=1,
)

# Filtering out columns that are useless or difficult to use (e.g., process specific metrics).
# A column is kept only if every filter accepts it. Selecting with a boolean mask
# (instead of by label) keeps each column at most once, even if the renaming
# above produced identical names for columns that are dropped here.
column_filters = (filter_namespaces, filter_go_specific, filter_process_specific, filter_durations)
keep_mask = [all(column_filter(col) for column_filter in column_filters) for col in df.columns]
df = df.loc[:, keep_mask]

# The steps below select columns by name, which only works with unique names.
# Removing "container_id"/"pod_name" can make names collide, so fail with a clear message.
if not df.columns.is_unique:
    duplicated_names = df.columns[df.columns.duplicated()].unique().tolist()
    raise ValueError(f"Duplicate column names after renaming: {duplicated_names}")

# Convert monotonically increasing cols to rates (counters to gauges) (e.g., total joules to joules per unit of time)
counters = get_counters(df)
df = convert_to_rates(df, counters)
counter_names = set(counters)  # set membership instead of scanning the list per column
filtered_columns = [col for col in df.columns if col not in counter_names]
df = df[filtered_columns]
df = df.copy()  # Get defragmented copy of the df after all the edits


# Print remaining cols
for col in df.columns:
    print(col)

In [None]:
"""
Get and print all monotonic columns (e.g., counters)

These cannot be used directly for machine learning, since they are specific to how long the cluster/experiment has been running.
"""
# Walk over the columns once; remember and print each monotonically increasing one
counters = []
for col in df.columns:
    if not df[col].is_monotonic_increasing:
        continue
    counters.append(col)
    print(col)